/*
 * $Header: /home/cvspublic/jakarta-slide/src/share/org/apache/slide/extractor/MSWordExtractor.java,v 1.3 2005/01/14 18:34:13 luetzkendorf Exp $
 * $Revision: 1.3 $
 * $Date: 2005/01/14 18:34:13 $
 *
 * ====================================================================
 *
 * Copyright 2004 The Apache Software Foundation 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package org.apache.slide.extractor;

import java.io.FileInputStream;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import org.textmining.text.extraction.WordExtractor;

/**
 * Content extractor for Microsoft Word documents.
 *
 * <p>The binary document is handed to
 * {@link org.textmining.text.extraction.WordExtractor}, and the plain text it
 * returns is exposed to the caller through a {@link Reader}.</p>
 */
public class MSWordExtractor extends AbstractContentExtractor {

    /** First content type handled by this extractor. */
    static final String CONTENT_TYPE_WORD_1 = "application/msword";

    /** Second content type handled by this extractor. */
    static final String CONTENT_TYPE_WORD_2 = "application/vnd.ms-word";

    /** Both handled content types joined into one comma separated string. */
    static final String CONTENT_TYPE_WORD_ALL_CSV = CONTENT_TYPE_WORD_1 + "," + CONTENT_TYPE_WORD_2;

    /**
     * Creates a new extractor. All three arguments are passed unchanged to the
     * superclass constructor.
     *
     * @param uri         forwarded to the superclass
     * @param contentType forwarded to the superclass; when the superclass
     *                    reports <code>null</code> for it,
     *                    {@link #getContentType()} falls back to
     *                    {@link #CONTENT_TYPE_WORD_ALL_CSV}
     * @param namespace   forwarded to the superclass
     */
    public MSWordExtractor(String uri, String contentType, String namespace) {
        super(uri, contentType, namespace);
    }

    /**
     * Extracts the text of a Word document.
     *
     * <p>The stream is not closed by this method; that stays the caller's
     * responsibility.</p>
     *
     * @param content stream positioned at the start of the Word document
     * @return a reader over the extracted text
     * @throws ExtractorException if the text could not be extracted; the
     *         original exception is attached as the cause
     */
    public Reader extract(InputStream content) throws ExtractorException {
        try {
            WordExtractor extractor = new WordExtractor();
            String text = extractor.extractText(content);
            return new StringReader(text);
        }
        catch (Exception e) {
            // Many exceptions (e.g. NullPointerException) carry no message;
            // fall back to toString() so the failure is never reported as "null".
            String message = e.getMessage();
            if (message == null) {
                message = e.toString();
            }
            // Keep the original exception (and its stack trace) reachable.
            // NOTE(review): relies on ExtractorException(String) leaving the
            // cause uninitialised, which Throwable.initCause requires - confirm.
            ExtractorException failure = new ExtractorException(message);
            failure.initCause(e);
            throw failure;
        }
    }

    /**
     * Command line helper: extracts the text of the Word file named by the
     * first argument and prints it to standard output.
     *
     * @param args <code>args[0]</code> is the path of the Word file to read
     * @throws Exception if the file cannot be read or the extraction fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: java org.apache.slide.extractor.MSWordExtractor <word-file>");
            return;
        }

        FileInputStream in = new FileInputStream(args[0]);
        try {
            MSWordExtractor ex = new MSWordExtractor(null, null, null);
            Reader reader = ex.extract(in);

            // Test for end-of-stream before printing, so that the -1 marker
            // is never written out as a bogus '\uFFFF' character.
            int c;
            while ((c = reader.read()) != -1) {
                System.out.print((char) c);
            }
            System.out.flush();
        }
        finally {
            in.close();
        }
    }

    /**
     * Returns the content type reported by the superclass or, when that is
     * <code>null</code>, the comma separated list of Word content types
     * ({@link #CONTENT_TYPE_WORD_ALL_CSV}).
     *
     * @return the content type(s) handled by this extractor
     * @see org.apache.slide.extractor.Extractor#getContentType()
     */
    public String getContentType() {
        String configured = super.getContentType();
        if (configured == null) {
            return CONTENT_TYPE_WORD_ALL_CSV;
        }
        return configured;
    }

}