package org.apache.slide.extractor;

import java.io.InputStream;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.poi.hpsf.NoPropertySetStreamException;
import org.apache.poi.hpsf.Property;
import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.Section;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.slide.common.PropertyName;
import org.apache.slide.util.conf.Configurable;
import org.apache.slide.util.conf.Configuration;
import org.apache.slide.util.conf.ConfigurationException;

/**
 * Property extractor for Microsoft office documents.
 * 
 * <p>This property extractor extracts properties from <code>SummaryInformation</code> and
 * <code>DocumentSummaryInformation</code> headers of office documents.
 * 
 * <p>Sample configuration:
 * <pre>
 *   &lt;extractor classname="org.apache.slide.extractor.OfficeExtractor" uri="/files/docs/">
 *     &lt;configuration>
 *       &lt;instruction property="author" namespace="http://mycomp.com/namespaces/webdav" summary-information="4" />
 *       &lt;instruction property="application" namespace="http://mycomp.com/namespaces/webdav" summary-information="18" />
 *       &lt;instruction property="title" namespace="http://mycomp.com/namespaces/webdav" summary-information="2" />
 *       &lt;instruction property="category" namespace="http://mycomp.com/namespaces/webdav" document-summary-information="2" />
 *       &lt;instruction property="docid" namespace="http://mycomp.com/namespaces/webdav" label="Document-ID" />
 *     &lt;/configuration>
 *   &lt;/extractor>
 * </pre>
 * The sample configuration
 * <ul>
 * <li>maps the <em>author</em> info of office documents to the <code>author</code>
 * property. The author info can be found in the <code>SummaryInformation</code> header and
 * has the <code>id</code> 4.
 * <li>and maps the <em>category</em> entry of the <code>DocumentSummaryInformation</code> header,
 * which has the <code>id</code> 2, to the WebDAV property <code>category</code>.
 * <li><code>SummaryInformation</code> headers can also contain "labeled" entries, e.g. for user
 * defined metadata. In the sample the labeled entries with the label <code>Document-ID</code>
 * will be mapped to the WebDAV property <code>docid</code>.
 * </ul>
 * All WebDAV properties in the sample will have the namespace
 * <code>http://mycomp.com/namespaces/webdav</code>.
 * 
 * <p>The IDs in the <code>DocumentSummaryInformation</code> and <code>SummaryInformation</code>
 * headers are not self-explanatory. Examples for <code>SummaryInformation</code> are:
 * <pre>
 *    1: codepage
 *    2: title
 *    3: subject
 *    4: author
 *    5: keywords
 *    6: comments
 *    7: template (e.g. "Normal.dot")
 *    8: last author
 *    9: revision number
 *   11: last printing date
 *   12: creation date
 *   13: last saved date
 *   14: number of pages
 *   15: number of words
 *   16: number of characters
 *   18: application name (e.g. "Microsoft Word 9.0")
 *   19: security
 * </pre>
 * Samples for <code>DocumentSummaryInformation</code> are:
 * <pre>
 *    1: codepage
 *    2: category
 *    5: number of lines
 *    6: number of paragraphs
 *   14: manager
 *   15: company
 * </pre>
 */
public class OfficeExtractor extends AbstractPropertyExtractor implements Configurable {
	/**
	 * Maps <code>SummaryInformation</code> property IDs (<code>Integer</code>) to the
	 * <code>PropertyName</code> they are extracted to. Filled by {@link #configure(Configuration)}.
	 */
	protected Map propertyMapSI = new HashMap();

	/**
	 * Maps <code>DocumentSummaryInformation</code> property IDs (<code>Integer</code>) to the
	 * <code>PropertyName</code> they are extracted to. Filled by {@link #configure(Configuration)}.
	 */
	protected Map propertyMapDSI = new HashMap();

	/**
	 * Maps the labels (<code>String</code>) of labeled properties to the
	 * <code>PropertyName</code> they are extracted to. Filled by {@link #configure(Configuration)}.
	 */
	protected Map propertyMapLbl = new HashMap();

	/**
	 * Comma-separated concatenation of the Word, Excel and PowerPoint content type lists.
	 * Returned by {@link #getContentType()} when the superclass reports no content type.
	 */
	static final String CONTENT_TYPE_MS_OFFICE_ALL_CSV = 
		MSWordExtractor.CONTENT_TYPE_WORD_ALL_CSV + "," +
		MSExcelExtractor.CONTENT_TYPE_EXCEL_ALL_CSV + "," +
		MSPowerPointExtractor.CONTENT_TYPE_POWERPOINT_ALL_CSV;

	/**
	 * Creates the extractor; all arguments are passed unchanged to the superclass constructor.
	 * 
	 * @param uri         passed to the superclass
	 * @param contentType passed to the superclass; if the superclass then reports
	 *                    <code>null</code>, {@link #getContentType()} falls back to
	 *                    {@link #CONTENT_TYPE_MS_OFFICE_ALL_CSV}
	 * @param namespace   passed to the superclass
	 */
	public OfficeExtractor(String uri, String contentType, String namespace) {
		super(uri, contentType, namespace);
	}

	/**
	 * Reads the given stream with a <code>POIFSReader</code> and collects all configured
	 * properties found in its property set streams.
	 * 
	 * @param content the document content to read
	 * @return map from <code>PropertyName</code> to the extracted property value; empty if
	 *         nothing matched the configuration
	 * @throws ExtractorException if reading the stream or one of its property sets fails;
	 *         the original exception is attached as the cause
	 */
	public Map extract(InputStream content) throws ExtractorException {
		OfficePropertiesListener listener = new OfficePropertiesListener();
		try {
			POIFSReader r = new POIFSReader();
			r.registerListener(listener);
			r.read(content);
		} catch (Exception e) {
			ExtractorException wrapped = new ExtractorException(
					"Exception while extracting properties in OfficeExtractor: " + e);
			// keep the original stack trace available to whoever logs this exception
			wrapped.initCause(e);
			throw wrapped;
		}
		return listener.getProperties();
	}

	/**
	 * Reader listener that picks the configured properties out of every property set
	 * stream it is notified about and accumulates them in one map.
	 */
	class OfficePropertiesListener implements POIFSReaderListener {

		private final Map extractedProperties = new HashMap();

		/**
		 * @return the properties collected so far, keyed by <code>PropertyName</code>
		 */
		public Map getProperties() {
			return extractedProperties;
		}

		/**
		 * Handles one reader event. Streams that are not property sets are skipped.
		 * Sections that carry a dictionary (property ID 0) are matched by label,
		 * all other sections are matched by property ID.
		 * 
		 * @param event the event describing the stream to inspect
		 * @throws RuntimeException if the stream cannot be parsed as a property set for a
		 *         reason other than not being one; the original exception is the cause
		 */
		public void processPOIFSReaderEvent(POIFSReaderEvent event) {
			PropertySet ps;
			try {
				ps = PropertySetFactory.create(event.getStream());
			} catch (NoPropertySetStreamException ex) {
				// not a property set stream, nothing to extract from it
				return;
			} catch (Exception ex) {
				throw new RuntimeException(
						"Property set stream \"" + event.getPath() + event.getName() + "\": " + ex, ex);
			}

			Map idMap;
			if (ps.isDocumentSummaryInformation()) {
				idMap = propertyMapDSI;
			} else if (ps.isSummaryInformation()) {
				idMap = propertyMapSI;
			} else {
				// neither of the two known property sets: no ID mapping applies
				idMap = Collections.EMPTY_MAP;
			}

			for (Iterator i = ps.getSections().iterator(); i.hasNext();) {
				Section sec = (Section) i.next();

				// property 0 holds the section's dictionary (ID -> label), if it has one
				Object dictionary = sec.getProperty(0);
				if (dictionary instanceof Map) {
					extractLabeledProperties(sec, (Map) dictionary);
				} else {
					extractPropertiesById(sec, idMap);
				}
			}
		}

		/** Copies every property of the section whose ID is a key of <code>idMap</code>. */
		private void extractPropertiesById(Section sec, Map idMap) {
			for (Iterator j = idMap.entrySet().iterator(); j.hasNext();) {
				Map.Entry e = (Map.Entry) j.next();

				Object propertyValue = sec.getProperty(((Integer) e.getKey()).intValue());
				if (propertyValue != null) {
					extractedProperties.put(e.getValue(), propertyValue);
				}
			}
		}

		/** Copies every property of the section whose dictionary label is configured. */
		private void extractLabeledProperties(Section sec, Map dictionary) {
			Property[] properties = sec.getProperties();
			for (int j = 0; j < properties.length; j++) {
				// TODO sec.getPIDString(id) did not yield the label here, so the
				// dictionary is consulted directly
				Object label = dictionary.get(new Long(properties[j].getID()));
				PropertyName slideProperty = (PropertyName) propertyMapLbl.get(label);
				if (slideProperty != null) {
					extractedProperties.put(slideProperty, properties[j].getValue());
				}
			}
		}
	}

	/**
	 * Reads all <code>instruction</code> child configurations and registers the mapping
	 * each one describes. For every instruction the first attribute found, in this
	 * order, decides the kind of mapping:
	 * <ol>
	 * <li><code>summary-information</code>: numeric <code>SummaryInformation</code> ID
	 * <li><code>document-summary-information</code>: numeric
	 *     <code>DocumentSummaryInformation</code> ID
	 * <li><code>label</code>: label of a labeled property
	 * <li><code>id</code>: old style value such as <code>SummaryInformation-0-4</code>;
	 *     the number after the last <code>-</code> is the ID
	 * </ol>
	 * Instructions with none of these attributes, or with an old style <code>id</code>
	 * that starts with neither known prefix, are ignored.
	 * 
	 * @param configuration the extractor configuration
	 * @throws ConfigurationException if an ID is not a valid integer
	 */
	public void configure(Configuration configuration) throws ConfigurationException {
		Enumeration instructions = configuration.getConfigurations("instruction");
		while (instructions.hasMoreElements()) {
			Configuration instruction = (Configuration) instructions.nextElement();
			PropertyName propertyName = PropertyName.getPropertyName(
					instruction.getAttribute("property"),
					instruction.getAttribute("namespace", "DAV:"));

			try {
				String id = instruction.getAttribute("summary-information", null);
				if (id != null) {
					this.propertyMapSI.put(Integer.valueOf(id), propertyName);
					continue;
				}

				id = instruction.getAttribute("document-summary-information", null);
				if (id != null) {
					this.propertyMapDSI.put(Integer.valueOf(id), propertyName);
					continue;
				}

				id = instruction.getAttribute("label", null);
				if (id != null) {
					this.propertyMapLbl.put(id, propertyName);
					continue;
				}

				// for backward compatibility:
				// old style id attributes like SummaryInformation-0-4
				id = instruction.getAttribute("id", null);
				if (id != null) {
					Integer intId = Integer.valueOf(id.substring(id.lastIndexOf('-') + 1));
					if (id.startsWith("SummaryInformation")) {
						this.propertyMapSI.put(intId, propertyName);
					} else if (id.startsWith("DocumentSummaryInformation")) {
						this.propertyMapDSI.put(intId, propertyName);
					}
				}
			} catch (NumberFormatException e) {
				throw new ConfigurationException("Invalid instruction: " + e, instruction);
			}
		}
	}

	/**
	 * Returns the content type reported by the superclass, or
	 * {@link #CONTENT_TYPE_MS_OFFICE_ALL_CSV} if the superclass reports <code>null</code>.
	 * 
	 * @see org.apache.slide.extractor.Extractor#getContentType()
	 */
	public String getContentType() {
		String configured = super.getContentType();
		if (configured == null) {
			return CONTENT_TYPE_MS_OFFICE_ALL_CSV;
		}
		return configured;
	}

}