/*
 * $Header: /home/cvspublic/jakarta-slide/src/share/org/apache/slide/extractor/SimpleXmlExtractor.java,v 1.12 2005/04/04 13:43:17 luetzkendorf Exp $
 * $Revision: 1.12 $
 * $Date: 2005/04/04 13:43:17 $
 *
 * ====================================================================
 *
 * Copyright 2004 The Apache Software Foundation 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package org.apache.slide.extractor;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.slide.common.PropertyName;
import org.apache.slide.util.conf.Configurable;
import org.apache.slide.util.conf.Configuration;
import org.apache.slide.util.conf.ConfigurationException;
import org.jdom.Attribute;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Namespace;
import org.jdom.Text;
import org.jdom.input.SAXBuilder;
import org.jdom.xpath.XPath;

/**
 * Property extractor for XML files that maps XPath expressions to WebDAV properties.
 *
 * <p>For each property to be extracted, an XPath expression must be specified. The XPath
 * is evaluated against the XML document during extraction. The first node of the resulting
 * node set is used to determine the property value. For <code>text</code> nodes the text
 * becomes the property's value, for <code>attribute</code> nodes the attribute value, and for
 * <code>element</code> nodes the concatenation of all descendant <code>text</code> nodes.
 *
 * <p><b>Sample</b>: 
 * <pre>
 *   &lt;extractor classname="org.apache.slide.extractor.SimpleXmlExtractor" 
 *              content-type="text/xml" uri="/files/xdocs">
 *      &lt;configuration>
 *         &lt;instruction property="id" namespace="http://mycomp.com/namespaces/webdav" xpath="/xdoc/@id"/>
 *         &lt;instruction property="title" namespace="http://mycomp.com/namespaces/webdav" xpath="/xdoc/head/title"/>
 *         &lt;instruction property="date" namespace="http://mycomp.com/namespaces/webdav" xpath="/xdoc/date"/>
 *      &lt;/configuration>
 *   &lt;/extractor>
 * </pre>
 * 
 * <p><b>Sample with XML namespaces</b>: 
 * <pre>
 *   &lt;extractor classname="org.apache.slide.extractor.SimpleXmlExtractor" 
 *              content-type="text/xml" uri="/files/xdocs">
 *      &lt;configuration>
 *         &lt;xmlnamespace prefix="pre" uri="http://mycomp.com/namespaces/xmldocs"/>
 *         &lt;instruction property="id" namespace="http://mycomp.com/namespaces/webdav" xpath="/pre:xdoc/@id"/>
 *         &lt;instruction property="title" namespace="http://mycomp.com/namespaces/webdav" xpath="/pre:xdoc/pre:head/pre:title"/>
 *         &lt;instruction property="date" namespace="http://mycomp.com/namespaces/webdav" xpath="/pre:xdoc/pre:date"/>
 *      &lt;/configuration>
 *   &lt;/extractor>
 * </pre>
 */
public class SimpleXmlExtractor extends AbstractPropertyExtractor implements Configurable {

    static final String CONTENT_TYPE_XML = "text/xml";
    static final String CONTENT_TYPE_XHTML = "application/xhtml+xml";
    // HTML is listed as well because XHTML is very often served with the HTML content type.
    static final String CONTENT_TYPE_HTML = "text/html";
    /** Comma-separated list of the content types above; used when no content type is configured. */
    static final String CONTENT_TYPE_XML_ALL_CSV = CONTENT_TYPE_XML+","+CONTENT_TYPE_XHTML+","+CONTENT_TYPE_HTML;

    /** The {@link Instruction}s evaluated by {@link #extract(InputStream)}, in the order they were added. */
    protected List instructions = new ArrayList();
    /** JDOM {@link Namespace}s that are registered on every XPath built by {@link #createInstruction}. */
    protected List namespaces = new ArrayList();

    public SimpleXmlExtractor(String uri, String contentType, String namespace) {
        super(uri, contentType, namespace);
    }

    /**
     * Parses the given content as XML and evaluates every configured instruction against it.
     *
     * @param content  the XML document to read.
     * @return  a map from {@link PropertyName} to the extracted value; instructions whose
     *          filtered value is <code>null</code> contribute no entry.
     * @throws ExtractorException  if the content cannot be read, or if JDOM reports an error
     *          while parsing the document or evaluating an XPath. The underlying exception
     *          is attached as the cause.
     */
    public Map extract(InputStream content) throws ExtractorException {
        Map properties = new HashMap();
        try {
            // NOTE(review): the builder runs with its default settings. If content can come from
            // untrusted clients, confirm that DTD / external entity resolution is acceptable (XXE).
            SAXBuilder saxBuilder = new SAXBuilder();
            Document document = saxBuilder.build(content);
            for (Iterator i = instructions.iterator(); i.hasNext();) {
                Instruction instruction = (Instruction) i.next();
                XPath xPath = instruction.getxPath();
                List nodeList = xPath.selectNodes(document);
                Object propertyValue = filter(nodeList, instruction);
                if (propertyValue != null) {
                    properties.put(instruction.getPropertyName(), propertyValue);
                }
            }
        } catch (IOException e) {
            throw withCause(new ExtractorException("Exception while retrieving content"), e);
        } catch (JDOMException e) {
            // Covers both parse errors and XPath evaluation errors; the cause tells them apart.
            throw withCause(
                    new ExtractorException("Exception while parsing content. XML document must be wellformed."), e);
        }
        return properties;
    }

    /**
     * Attaches the original failure to the exception so that it is not lost to callers and logs.
     */
    private static ExtractorException withCause(ExtractorException failure, Throwable cause) {
        failure.initCause(cause);
        return failure;
    }

    /**
     * Reads the <code>xmlnamespace</code> and <code>instruction</code> children of the
     * configuration. Namespaces are read first so that they are already known when the
     * XPath of each instruction is created.
     *
     * @param conf  the extractor configuration.
     * @throws ConfigurationException  if an attribute cannot be read or an XPath cannot be created.
     */
    public void configure(Configuration conf) throws ConfigurationException {
        for (Enumeration e = conf.getConfigurations("xmlnamespace"); e.hasMoreElements();) {
            Configuration xpathNamespace = (Configuration) e.nextElement();
            this.namespaces.add(Namespace.getNamespace(xpathNamespace.getAttribute("prefix"),
                                                       xpathNamespace.getAttribute("uri")));
        }
        for (Enumeration e = conf.getConfigurations("instruction"); e.hasMoreElements();) {
            Configuration instruction = (Configuration) e.nextElement();
            addInstruction(createInstruction(instruction));
        }
    }

    /**
     * Allow subclasses to apply filtering to property values before they are written.
     * Returning null signals that the extractor ignores this value.
     *
     * <p>This implementation only looks at the first node of the list: text and attribute
     * nodes yield their value, strings are returned as they are, and elements yield the
     * concatenation of all descendant text nodes. Any other node type yields <code>null</code>.
     *
     * @param nodeList  the node list identified by the XPath of the instruction.
     * @param instruction  the instruction being evaluated; not used by this implementation.
     * @return  the property value to be set, <code>null</code> if to be ignored.
     */
    protected Object filter(List nodeList, Instruction instruction) throws ExtractorException {
        if (nodeList.size() > 0) {
            Object node = nodeList.get(0);
            if (node instanceof Text) {
                return ((Text) node).getText();
            } else if (node instanceof Attribute) {
                return ((Attribute) node).getValue();
            } else if (node instanceof String) {
                return node;
            } else if (node instanceof Element) {
                StringBuffer text = new StringBuffer();
                getElementText(node, text);
                return text.toString();
            }
        }
        return null;
    }

    /**
     * Appends the text of all text nodes found at or below the given node to the buffer,
     * in document order. Nodes that are neither elements nor text are skipped.
     */
    private void getElementText(Object o, StringBuffer buffer) {
        if (o instanceof Element) {
            List list = ((Element) o).getContent();
            for (int i = 0, l = list.size(); i < l; i++) {
                Object subNode = list.get(i);
                getElementText(subNode, buffer);
            }
        } else if (o instanceof Text) {
            buffer.append(((Text) o).getText());
        } else {
            // ignore other stuff
        }
    }

    /**
     * Registers an instruction to be evaluated on every extraction.
     */
    protected void addInstruction(Instruction instruction) {
        instructions.add(instruction);
    }

    /**
     * Builds an instruction from an <code>instruction</code> configuration element.
     * The <code>namespace</code> attribute defaults to <code>DAV:</code>; all namespaces
     * collected so far are registered on the new XPath.
     *
     * @param instruction  the configuration element providing the <code>property</code>,
     *                     <code>namespace</code> and <code>xpath</code> attributes.
     * @return  the instruction pairing the compiled XPath with the property name.
     * @throws ConfigurationException  if the XPath cannot be created.
     */
    protected Instruction createInstruction(Configuration instruction)
            throws ConfigurationException {
        try {
            String property = instruction.getAttribute("property");
            String propertyNamespace = instruction.getAttribute("namespace", "DAV:");
            XPath xPath = XPath.newInstance(instruction.getAttribute("xpath"));
            for (Iterator i = namespaces.iterator(); i.hasNext();) {
                xPath.addNamespace((Namespace) i.next());
            }
            return new Instruction(xPath, PropertyName.getPropertyName(property,
                    propertyNamespace));
        } catch (JDOMException e) {
            throw new ConfigurationException("Could not create xPath from given attribute",
                    instruction);
        }
    }

    /**
     * Pairs an XPath expression with the name of the property it provides the value for.
     */
    protected static class Instruction {

        private final XPath xPath;
        private final PropertyName propertyName;

        public Instruction(XPath xPath, PropertyName property) {
            this.xPath = xPath;
            this.propertyName = property;
        }

        public XPath getxPath() {
            return xPath;
        }

        public PropertyName getPropertyName() {
            return propertyName;
        }
    }

    /**
     * Returns the content type reported by the superclass, or the comma-separated list of
     * XML, XHTML and HTML content types if the superclass reports none.
     *
     * @see org.apache.slide.extractor.Extractor#getContentType()
     */
    public String getContentType() {
        String configured = super.getContentType();
        if (configured == null) {
            return CONTENT_TYPE_XML_ALL_CSV;
        }
        return configured;
    }
}
