/*
 * LIUS - Lucene Index Update and Search
 * http://sourceforge.net/projects/lius/
 *
 * Copyright (c) 2004, Laval University Library.  All rights reserved.
 *
 * This program is a free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of
 * the License, or (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package ca.ulaval.bibl.lius.index.HTML;

/**
 * <p>Titre : Lius (Lucene Index, Update and Search)</p>
 * <p>Description : Application Java permettant d'indexer du XML, PDF, HTML, JSP, ASP, PHP, Word, Excel et des objets Java</p>
 * <p>Copyright : Copyright (c) 2003 Rida Benjelloun</p>
 * @author Rida Benjelloun
 * @e-mail rbenjelloun@hotmail.com
 *         rida.benjelloun@bibl.ulaval.ca
 * @version 0.0.1
 * @date 09-08-2003
 */

import java.util.Collection;
import org.cyberneko.html.parsers.DOMParser;
//import org.w3c.dom.Document;
import org.xml.sax.SAXException;
import java.io.IOException;
import org.jdom.input.DOMBuilder;
import org.jdom.JDOMException;

import java.io.File;

import ca.ulaval.bibl.lius.config.LiusConfig;

import ca.ulaval.bibl.lius.index.XML.XmlFileIndexer;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.FileOutputStream;
import java.io.BufferedWriter;
import java.io.OutputStreamWriter;
import ca.ulaval.bibl.lius.config.LiusConfigBuilder;



/**
 * Classe permettant d'indexer des fichiers HTML
 * <br/><br/>
 * Class for indexing HTML files.
 * @author Rida Benjelloun (rida.benjelloun@bibl.ulaval.ca)
 */

/**
 * Indexer for HTML files.
 *
 * An HTML file is first copied to a temporary sibling file with any XML
 * declaration removed, then parsed with the NekoHTML DOM parser and converted
 * to a JDOM document that the inherited XML indexing code works on.
 *
 * The name of the temporary file is kept in an instance field between the
 * parsing step and the clean-up step, so a single instance should not be
 * used by several threads at once without external synchronisation.
 */
public class HtmlIndexer
    extends XmlFileIndexer {

  /** Suffix inserted before the extension to build the temporary file name. */
  private static final String TMP_SUFFIX = "_liusTmpFileToIndex";

  /**
   * Single-byte charset used to copy the file. Every byte maps to exactly one
   * character and back, so the copy is byte-preserving whatever the real
   * encoding of the document is; the markers searched for are plain ASCII.
   */
  private static final String COPY_CHARSET = "ISO-8859-1";

  /** Path of the temporary file to remove, or null when there is none. */
  private String fileToDelete = null;

  /**
   * Parses an HTML file into a JDOM document.
   *
   * @param file path of the HTML file, as a String
   * @return the resulting org.jdom.Document, or null when the file could not
   *         be read or parsed (the failure is printed on the error stream)
   */
  public Object parse(Object file) {
    org.jdom.Document jdomDoc = null;
    try {
      String newFile = omitXMLDeclaration((String) file);
      DOMParser parser = new DOMParser();
      parser.parse(newFile);
      jdomDoc = convert(parser.getDocument());
    }
    catch (Exception e) {
      // Best effort, as before: every failure (I/O, SAX, JDOM, bad argument
      // type) is reported and signalled to the caller by a null result.
      e.printStackTrace();
    }
    return jdomDoc;
  }

  /**
   * Converts a W3C DOM document into a JDOM document.
   *
   * @param domDoc the DOM document to convert
   * @return the equivalent JDOM document
   */
  public org.jdom.Document convert(org.w3c.dom.Document domDoc) throws
      JDOMException, IOException {
    DOMBuilder builder = new DOMBuilder();
    return builder.build(domDoc);
  }

  /**
   * Builds a Lucene document from the file to index, using the HTML fields
   * declared in the given Lius configuration.
   *
   * @param file path of the HTML file to index
   * @param lc   Lius configuration providing the HTML fields
   * @return the Lucene document produced for the file
   */
  public org.apache.lucene.document.Document createLuceneDocument(String file,
      LiusConfig lc) {
    return createLuceneDocument(file, lc.getHtmlFields());
  }

  /**
   * Parses the HTML file and lets the superclass populate the given
   * collection of LiusField objects from the resulting JDOM document. The
   * temporary file created while parsing is removed before returning, even
   * when parsing or population fails.
   *
   * @param file       path of the HTML file, as a String
   * @param liusFields collection of LiusField objects describing the fields
   * @return the collection returned by the superclass
   */
  public Collection getPopulatedCollection(Object file, Collection liusFields) {
    try {
      org.jdom.Document jdomDoc = (org.jdom.Document) this.parse(file);
      return super.getPopulatedCollection(jdomDoc, liusFields);
    }
    finally {
      deleteTempFile();
    }
  }

  /**
   * Returns the Lius fields used for HTML indexing.
   *
   * @param lc Lius configuration
   * @return the HTML fields of the configuration
   */
  public Collection getLiusFields(LiusConfig lc) {
    return lc.getHtmlFields();
  }

  /**
   * Copies the file to a temporary sibling file, dropping the XML declaration.
   *
   * For every line starting with "&lt;?xml", everything up to and including
   * the closing "?&gt;" is removed; when the closing marker is on a later
   * line, the lines in between are dropped too. All other content is copied
   * unchanged, one line at a time. The temporary file is named after the
   * original with "_liusTmpFileToIndex" inserted before the extension, and
   * its path is remembered so that it can be deleted later.
   *
   * @param file path of the file to copy
   * @return path of the temporary file
   * @throws FileNotFoundException if the source cannot be opened or the
   *         temporary file cannot be created
   * @throws IOException if reading or writing fails
   */
  public String omitXMLDeclaration(String file) throws
      FileNotFoundException, IOException {
    // Only a dot located in the file name itself marks an extension; a dot in
    // a directory name, or no dot at all, means "no extension".
    int lastSeparator = Math.max(file.lastIndexOf('/'), file.lastIndexOf('\\'));
    int dot = file.lastIndexOf('.');
    if (dot <= lastSeparator) {
      dot = file.length();
    }
    String fOut = file.substring(0, dot) + TMP_SUFFIX + file.substring(dot);

    FileInputStream fis = new FileInputStream(file);
    try {
      FileOutputStream fos = new FileOutputStream(fOut);
      // The file exists from here on: record it now so that it is cleaned up
      // even if the copy fails half-way.
      fileToDelete = fOut;
      try {
        BufferedReader in =
            new BufferedReader(new InputStreamReader(fis, COPY_CHARSET));
        BufferedWriter out =
            new BufferedWriter(new OutputStreamWriter(fos, COPY_CHARSET));
        boolean insideDeclaration = false;
        String line;
        while ( (line = in.readLine()) != null) {
          if (insideDeclaration || line.startsWith("<?xml")) {
            int end = line.indexOf("?>");
            if (end < 0) {
              // Declaration continues on the next line: drop this one.
              insideDeclaration = true;
              continue;
            }
            insideDeclaration = false;
            line = line.substring(end + 2);
          }
          out.write(line);
          // readLine() strips the terminator; put one back so that the text
          // of consecutive lines is not glued together.
          out.newLine();
        }
        out.flush();
      }
      finally {
        fos.close();
      }
    }
    finally {
      fis.close();
    }
    return fOut;
  }

  /**
   * Same as {@link #getPopulatedCollection(Object, LiusConfig)}, with the
   * configuration obtained from LiusConfigBuilder for the given value.
   *
   * @param file       path of the HTML file, as a String
   * @param liusConfig value handed to LiusConfigBuilder.getLiusConfig
   * @return the populated collection
   */
  public Collection getPopulatedCollection(Object file, String liusConfig) {
    LiusConfig lc = LiusConfigBuilder.getSingletonInstance().getLiusConfig(
        liusConfig);
    return getPopulatedCollection(file, lc);
  }

  /**
   * Populates the HTML fields of the given configuration from the file.
   *
   * @param file path of the HTML file, as a String
   * @param lc   Lius configuration providing the HTML fields
   * @return the populated collection
   */
  public Collection getPopulatedCollection(Object file, LiusConfig lc) {
    return getPopulatedCollection(file, lc.getHtmlFields());
  }

  /** Deletes the recorded temporary file, if any, and forgets its path. */
  private void deleteTempFile() {
    String path = fileToDelete;
    fileToDelete = null;
    if (path != null) {
      // Best effort: a failed deletion is not an indexing error.
      new File(path).delete();
    }
  }

}