/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.process;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.objectbank.XMLBeginEndIterator;
import edu.stanford.nlp.process.Function;
import edu.stanford.nlp.process.PTBEscapingProcessor;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import edu.stanford.nlp.process.WordToTaggedWordProcessor;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.web.HTMLParser;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
/**
 * Reads plain-text, XML or HTML documents from files, URLs or {@link Reader}s
 * and turns them into a flat list of words or a list of sentences.
 *
 * <p>Tokenization is delegated to a configurable {@link TokenizerFactory}
 * (a {@link PTBTokenizer} factory unless another one is set). When the factory
 * is a {@code WhitespaceTokenizer.WhitespaceTokenizerFactory} the input is
 * treated as already tokenized.
 *
 * <p>Instances are mutable (tokenizer factory, encoding, sentence-final words)
 * and perform no synchronization of their own.
 */
public class DocumentPreprocessor {
    private static final boolean DEBUG = false;

    /**
     * An input containing "tp://" or "tps://" (http, https, ftp, ftps, ...) is
     * fetched as a URL; anything else is opened as a local file.
     */
    private static final Pattern URL_PATTERN = Pattern.compile(".*?tps?://.*?");

    private static final String USAGE =
            "usage: DocumentPreprocessor -file filename [-xml tag|-html] [-noSplitSentence]";

    // Document kinds selectable from the command line in main().
    static final int plain = 0;
    static final int xml = 1;
    static final int html = 2;

    private TokenizerFactory tokenizerFactory;
    private String encoding;
    private String[] sentenceFinalPuncWords;

    /**
     * Creates a preprocessor that tokenizes with the given factory.
     *
     * @param tokenizerFactory factory used to build a tokenizer for each input
     */
    public DocumentPreprocessor(TokenizerFactory tokenizerFactory) {
        this.tokenizerFactory = tokenizerFactory;
    }

    /** Creates a preprocessor that tokenizes with {@link PTBTokenizer#factory()}. */
    public DocumentPreprocessor() {
        this.tokenizerFactory = PTBTokenizer.factory();
    }

    /**
     * Sets the character encoding used when opening local files.
     *
     * @param encoding charset name, or {@code null} to let {@link FileReader}
     *                 pick its default; not used for URL inputs
     */
    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }

    /**
     * Sets the tokens handed to {@link WordToSentenceProcessor} when splitting
     * words into sentences.
     *
     * @param sentenceFinalPuncWords tokens to pass to the sentence splitter, or
     *                               {@code null} to use the splitter's no-arg constructor
     */
    public void setSentenceFinalPuncWords(String[] sentenceFinalPuncWords) {
        this.sentenceFinalPuncWords = sentenceFinalPuncWords;
    }

    /**
     * Replaces the tokenizer factory.
     *
     * @param newTokenizerFactory factory used for all subsequent calls
     */
    public void setTokenizerFactory(TokenizerFactory newTokenizerFactory) {
        this.tokenizerFactory = newTokenizerFactory;
    }

    /** Switches to the factory returned by {@link PTBTokenizer#factory()}. */
    public void usePTBTokenizer() {
        this.tokenizerFactory = PTBTokenizer.factory();
    }

    /**
     * Switches to the factory returned by {@code WhitespaceTokenizer.factory()},
     * i.e. subsequent input is treated as pre-tokenized.
     */
    public void useWhitespaceTokenizer() {
        this.tokenizerFactory = WhitespaceTokenizer.factory();
    }

    /**
     * Tokenizes the text found at a file path or URL.
     *
     * @param fileOrURL local path, or a URL (see {@link #URL_PATTERN})
     * @return the tokens produced by the current tokenizer factory
     * @throws IOException if the file or URL cannot be opened
     */
    public List<Word> getWordsFromText(String fileOrURL) throws IOException {
        Reader reader = this.fileOrURLToReader(fileOrURL);
        try {
            return this.getWordsFromText(reader);
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Tokenizes everything readable from {@code input}. The reader is wrapped
     * in a {@link BufferedReader} but is not closed.
     *
     * @param input source text
     * @return the tokens produced by the current tokenizer factory
     */
    public List<Word> getWordsFromText(Reader input) {
        Tokenizer tokenizer = this.tokenizerFactory.getTokenizer(new BufferedReader(input));
        return tokenizer.tokenize();
    }

    /**
     * Reads a file or URL and splits it into sentences without escaping,
     * without an explicit sentence delimiter and without tag parsing.
     *
     * @param fileOrURL local path, or a URL (see {@link #URL_PATTERN})
     * @return one list of tokens per sentence
     * @throws IOException if the file or URL cannot be opened
     */
    public List<List<? extends HasWord>> getSentencesFromText(String fileOrURL) throws IOException {
        Reader reader = this.fileOrURLToReader(fileOrURL);
        try {
            return this.getSentencesFromText(reader);
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Reads a file or URL and splits it into sentences.
     *
     * @param fileOrURL         local path, or a URL (see {@link #URL_PATTERN})
     * @param doPTBEscaping     {@code true} to run a {@link PTBEscapingProcessor} over the tokens
     * @param sentenceDelimiter token that separates sentences, or {@code null}
     *                          to split with {@link WordToSentenceProcessor}
     * @param tagDelimiter      character separating word and tag, or a negative value for untagged input
     * @return one list of tokens per sentence
     * @throws IOException      if the file or URL cannot be opened
     * @throws RuntimeException if tags are requested but the tokenizer factory is not the whitespace one
     */
    public List<List<? extends HasWord>> getSentencesFromText(String fileOrURL, boolean doPTBEscaping, String sentenceDelimiter, int tagDelimiter) throws IOException {
        Reader reader = this.fileOrURLToReader(fileOrURL);
        try {
            return this.getSentencesFromText(reader, doPTBEscaping, sentenceDelimiter, tagDelimiter);
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Splits the text from {@code input} into sentences without escaping,
     * without an explicit sentence delimiter and without tag parsing.
     *
     * @param input source text; not closed by this method
     * @return one list of tokens per sentence
     */
    public List<List<? extends HasWord>> getSentencesFromText(Reader input) {
        return this.getSentencesFromText(input, false, null, -1);
    }

    /**
     * Reads a file or URL and splits it into sentences, applying a
     * caller-supplied escaper.
     *
     * @param input             local path, or a URL (see {@link #URL_PATTERN})
     * @param escaper           function applied to token lists, or {@code null} for none
     * @param sentenceDelimiter token that separates sentences, or {@code null}
     *                          to split with {@link WordToSentenceProcessor}
     * @param tagDelimiter      character separating word and tag, or a negative value for untagged input
     * @return one list of tokens per sentence
     * @throws IOException      if the file or URL cannot be opened
     * @throws RuntimeException if tags are requested but the tokenizer factory is not the whitespace one
     */
    public List<List<? extends HasWord>> getSentencesFromText(String input, Function<List<HasWord>, List<HasWord>> escaper, String sentenceDelimiter, int tagDelimiter) throws IOException {
        Reader reader = this.fileOrURLToReader(input);
        try {
            return this.getSentencesFromText(reader, escaper, sentenceDelimiter, tagDelimiter);
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Splits the text from {@code input} into sentences.
     *
     * <p>With a whitespace tokenizer factory the input is treated as
     * pre-tokenized and may carry tags; otherwise the configured tokenizer is
     * used and requesting tags is an error.
     *
     * @param input             source text; not closed by this method
     * @param escaper           function applied to token lists, or {@code null} for none
     * @param sentenceDelimiter token that separates sentences, or {@code null}
     *                          to split with {@link WordToSentenceProcessor}
     * @param tagDelimiter      character separating word and tag, or a negative value for untagged input
     * @return one list of tokens per sentence
     * @throws RuntimeException if {@code tagDelimiter >= 0} but the tokenizer factory is not the whitespace one
     */
    public List<List<? extends HasWord>> getSentencesFromText(Reader input, Function<List<HasWord>, List<HasWord>> escaper, String sentenceDelimiter, int tagDelimiter) {
        if (escaper == null) {
            escaper = new NullEscaper();
        }
        if (this.tokenizerFactory instanceof WhitespaceTokenizer.WhitespaceTokenizerFactory) {
            return this.sentencesFromPretokenizedText(input, escaper, sentenceDelimiter, tagDelimiter);
        }
        if (tagDelimiter >= 0) {
            throw new RuntimeException("Can't read tags from untokenized document.");
        }
        return this.sentencesFromUntokenizedText(input, escaper, sentenceDelimiter);
    }

    /**
     * Tokenizes an in-memory string.
     *
     * @param input the text itself (not a file name)
     * @return the tokens produced by the current tokenizer factory
     */
    public List<Word> getWordsFromString(String input) {
        Tokenizer tokenizer = this.tokenizerFactory.getTokenizer(new BufferedReader(new StringReader(input)));
        return tokenizer.tokenize();
    }

    /**
     * Same as {@link #getSentencesFromXML(String, String, boolean)} with PTB
     * escaping switched on.
     *
     * @param fileOrURL  local path, or a URL (see {@link #URL_PATTERN})
     * @param splitOnTag tag name handed to {@link XMLBeginEndIterator}
     * @return one list of tokens per sentence
     * @throws IOException if the file or URL cannot be opened
     */
    public List<List<? extends HasWord>> getSentencesFromXML(String fileOrURL, String splitOnTag) throws IOException {
        return this.getSentencesFromXML(fileOrURL, splitOnTag, true);
    }

    /**
     * Reads an XML file or URL and splits the text selected by
     * {@code splitOnTag} into sentences.
     *
     * @param fileOrURL     local path, or a URL (see {@link #URL_PATTERN})
     * @param splitOnTag    tag name handed to {@link XMLBeginEndIterator}
     * @param doPTBEscaping {@code true} to run a {@link PTBEscapingProcessor} over the tokens
     * @return one list of tokens per sentence
     * @throws IOException if the file or URL cannot be opened
     */
    public List<List<? extends HasWord>> getSentencesFromXML(String fileOrURL, String splitOnTag, boolean doPTBEscaping) throws IOException {
        Reader reader = this.fileOrURLToReader(fileOrURL);
        try {
            return this.getSentencesFromXML(reader, splitOnTag, doPTBEscaping);
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Splits every string yielded by an {@link XMLBeginEndIterator} over
     * {@code input} into sentences and concatenates the results in order.
     *
     * @param input         XML source; not closed by this method
     * @param splitOnTag    tag name handed to {@link XMLBeginEndIterator}
     * @param doPTBEscaping {@code true} to run a {@link PTBEscapingProcessor} over the tokens
     * @return one list of tokens per sentence, across all yielded sections
     */
    public List<List<? extends HasWord>> getSentencesFromXML(Reader input, String splitOnTag, boolean doPTBEscaping) {
        XMLBeginEndIterator sections = new XMLBeginEndIterator(input, splitOnTag);
        List<List<? extends HasWord>> sentences = new ArrayList<List<? extends HasWord>>();
        while (sections.hasNext()) {
            String sectionText = (String) sections.next();
            Reader sectionReader = new BufferedReader(new StringReader(sectionText));
            sentences.addAll(this.getSentencesFromText(sectionReader, doPTBEscaping, null, -1));
        }
        return sentences;
    }

    /**
     * Extracts the text of an HTML file or URL and tokenizes it.
     *
     * @param fileOrURL local path, or a URL (see {@link #URL_PATTERN})
     * @return the tokens, or {@code null} if HTML parsing failed with an {@link IOException}
     * @throws IOException if the file or URL cannot be opened
     */
    public List<Word> getWordsFromHTML(String fileOrURL) throws IOException {
        Reader reader = this.fileOrURLToReader(fileOrURL);
        try {
            return this.getWordsFromHTML(reader);
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Runs {@link HTMLParser#parse} over {@code input} and tokenizes the
     * resulting string.
     *
     * @param input HTML source; not closed by this method
     * @return the tokens, or {@code null} if parsing failed with an
     *         {@link IOException} (the message is printed to standard error)
     */
    public List<Word> getWordsFromHTML(Reader input) {
        HTMLParser parser = new HTMLParser();
        try {
            String text = parser.parse(input);
            return this.getWordsFromText(new StringReader(text));
        } catch (IOException e) {
            // Existing contract: report on stderr and signal failure with null.
            System.err.println("IOException" + e.getMessage());
            return null;
        }
    }

    /**
     * Extracts the text of an HTML file or URL and splits it into sentences.
     *
     * @param fileOrURL local path, or a URL (see {@link #URL_PATTERN})
     * @return one list of tokens per sentence, or {@code null} if HTML parsing
     *         failed with an {@link IOException}
     * @throws IOException if the file or URL cannot be opened
     */
    public List<List<? extends HasWord>> getSentencesFromHTML(String fileOrURL) throws IOException {
        Reader reader = this.fileOrURLToReader(fileOrURL);
        try {
            return this.getSentencesFromHTML(reader);
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Runs {@link HTMLParser#parse} over {@code input} and splits the
     * resulting string into sentences.
     *
     * @param input HTML source; not closed by this method
     * @return one list of tokens per sentence, or {@code null} if parsing
     *         failed with an {@link IOException} (the message is printed to standard error)
     */
    public List<List<? extends HasWord>> getSentencesFromHTML(Reader input) {
        HTMLParser parser = new HTMLParser();
        try {
            String text = parser.parse(input);
            return this.getSentencesFromText(new StringReader(text));
        } catch (IOException e) {
            // Existing contract: report on stderr and signal failure with null.
            System.err.println("IOException" + e.getMessage());
            return null;
        }
    }

    /** Maps the boolean escaping flag onto an escaper and delegates to the Function-based overload. */
    private List<List<? extends HasWord>> getSentencesFromText(Reader fileOrURL, boolean doPTBEscaping, String sentenceDelimiter, int tagDelimiter) {
        Function<List<HasWord>, List<HasWord>> escaper;
        if (doPTBEscaping) {
            escaper = new PTBEscapingProcessor();
        } else {
            escaper = new NullEscaper();
        }
        return this.getSentencesFromText(fileOrURL, escaper, sentenceDelimiter, tagDelimiter);
    }

    /** Sentence splitting for whitespace-tokenized input, with optional word/tag separation. */
    private List<List<? extends HasWord>> sentencesFromPretokenizedText(Reader input, Function<List<HasWord>, List<HasWord>> escaper, String sentenceDelimiter, int tagDelimiter) {
        if (sentenceDelimiter == null) {
            WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(input, false);
            List words = tokenizer.tokenize();
            if (tagDelimiter >= 0) {
                WordToTaggedWordProcessor tagger = new WordToTaggedWordProcessor((char) tagDelimiter);
                words = tagger.process(words);
            }
            words = (List) escaper.apply(words);
            return this.splitIntoSentences(words);
        }
        // The boolean is true only for a newline delimiter, so that the
        // delimiter itself can show up as a token to split on.
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(input, sentenceDelimiter.equals("\n"));
        List sentences = this.splitListsOnToken(tokenizer.tokenize(), sentenceDelimiter);
        if (tagDelimiter >= 0) {
            sentences = this.tagSplitSentences(sentences, tagDelimiter);
        }
        sentences = new ListEscaper(escaper).apply(sentences);
        return sentences;
    }

    /** Sentence splitting for raw text that still has to go through the configured tokenizer. */
    private List<List<? extends HasWord>> sentencesFromUntokenizedText(Reader input, Function<List<HasWord>, List<HasWord>> escaper, String sentenceDelimiter) {
        if (sentenceDelimiter == null) {
            Tokenizer tokenizer = this.tokenizerFactory.getTokenizer(new BufferedReader(input));
            List words = tokenizer.tokenize();
            words = (List) escaper.apply(words);
            return this.splitIntoSentences(words);
        }
        // Cut the text at the delimiter using whitespace tokens first, then
        // re-tokenize each sentence with the configured tokenizer.
        // NOTE(review): the escaper is not applied on this path, unlike every
        // other path - confirm whether that is intentional.
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(input, true);
        List tokens = tokenizer.tokenize();
        List<String> sentenceStrings = this.glueSentences(this.splitListsOnToken(tokens, sentenceDelimiter));
        return this.tokenizeSentences(sentenceStrings);
    }

    /** Runs a WordToSentenceProcessor, configured with the sentence-final words if any were set, over the tokens. */
    private List<List<? extends HasWord>> splitIntoSentences(List words) {
        WordToSentenceProcessor splitter;
        if (this.sentenceFinalPuncWords != null) {
            splitter = new WordToSentenceProcessor(new HashSet<String>(Arrays.asList(this.sentenceFinalPuncWords)));
        } else {
            splitter = new WordToSentenceProcessor();
        }
        List sentences = splitter.process(words);
        return sentences;
    }

    /**
     * Cuts a token list into sentences at every token equal to the delimiter.
     * The delimiter tokens themselves are dropped. A delimiter always closes
     * the current sentence, even an empty one; trailing tokens form a final
     * sentence only if there are any.
     */
    private List<List<Word>> splitListsOnToken(List tokens, String sentenceDelimiter) {
        List<List<Word>> sentences = new ArrayList<List<Word>>();
        List<Word> current = new ArrayList<Word>();
        for (Object token : tokens) {
            Word word = (Word) token;
            if (word.word().equals(sentenceDelimiter)) {
                sentences.add(current);
                current = new ArrayList<Word>();
            } else {
                current.add(word);
            }
        }
        if (!current.isEmpty()) {
            sentences.add(current);
        }
        return sentences;
    }

    /** Turns each sentence (a list of words) back into one space-separated string. */
    private List<String> glueSentences(List<List<Word>> sentences) {
        List<String> glued = new ArrayList<String>(sentences.size());
        for (List<Word> sentence : sentences) {
            glued.add(this.glueSentence(sentence));
        }
        return glued;
    }

    /**
     * Joins the words of one sentence with single spaces. An empty sentence
     * (produced by consecutive or leading delimiters) yields an empty string
     * instead of failing.
     */
    private String glueSentence(List<Word> sentence) {
        StringBuilder glued = new StringBuilder();
        boolean first = true;
        for (Word word : sentence) {
            if (!first) {
                glued.append(" ");
            }
            glued.append(word.word());
            first = false;
        }
        return glued.toString();
    }

    /** Tokenizes each sentence string separately with the configured tokenizer factory. */
    private List<List<? extends HasWord>> tokenizeSentences(List<String> sentences) {
        List tokenized = new ArrayList(sentences.size());
        for (String sentence : sentences) {
            Tokenizer tokenizer = this.tokenizerFactory.getTokenizer(new StringReader(sentence));
            tokenized.add(tokenizer.tokenize());
        }
        return tokenized;
    }

    /** Runs a WordToTaggedWordProcessor for the given delimiter character over every sentence. */
    private List tagSplitSentences(List sentences, int tagDelimiter) {
        List<List> tagged = new ArrayList<List>(sentences.size());
        WordToTaggedWordProcessor tagger = new WordToTaggedWordProcessor((char) tagDelimiter);
        for (Object sentence : sentences) {
            tagged.add(tagger.process((List) sentence));
        }
        return tagged;
    }

    /**
     * Opens a reader over a URL (content fetched eagerly via
     * {@code StringUtils.slurpURL}) or over a local file, using the configured
     * encoding for files when one is set.
     */
    private Reader fileOrURLToReader(String fileOrURL) throws IOException {
        if (URL_PATTERN.matcher(fileOrURL).matches()) {
            URL url = new URL(fileOrURL);
            return new BufferedReader(new StringReader(StringUtils.slurpURL(url)));
        }
        if (this.encoding == null) {
            return new BufferedReader(new FileReader(fileOrURL));
        }
        InputStream stream = new FileInputStream(fileOrURL);
        boolean wrapped = false;
        try {
            Reader reader = new BufferedReader(new InputStreamReader(stream, this.encoding));
            wrapped = true;
            return reader;
        } finally {
            // An unsupported encoding name must not leave the file handle open.
            if (!wrapped) {
                stream.close();
            }
        }
    }

    /** Closes a reader this class opened itself; a failure to close is not worth reporting. */
    private static void closeQuietly(Reader reader) {
        try {
            reader.close();
        } catch (IOException ignored) {
            // The content has already been consumed; nothing useful to do here.
        }
    }

    /** Tokenizes each string of the list separately. Currently not called from within this class. */
    private List tokenizeListOfStrings(List stringList) {
        List documents = new ArrayList(stringList.size());
        for (Object text : stringList) {
            BufferedReader reader = new BufferedReader(new StringReader((String) text));
            Tokenizer tokenizer = this.tokenizerFactory.getTokenizer(reader);
            documents.add(tokenizer.tokenize());
        }
        return documents;
    }

    /**
     * Command-line driver: reads one document and prints the number of
     * resulting items followed by each item to standard error.
     *
     * <p>Options: {@code -file filename} (required), {@code -xml tag},
     * {@code -html}, {@code -noSplitSentence}. Unrecognized arguments are
     * ignored. XML input is always split into sentences.
     *
     * @param args command-line arguments
     * @throws IOException if the file or URL cannot be opened
     */
    public static void main(String[] args) throws IOException {
        if (args.length == 0) {
            System.err.println(USAGE);
            return;
        }
        boolean splitSentences = true;
        String xmlTag = null;
        int fileType = plain;
        String file = null;
        for (int i = 0; i < args.length; ++i) {
            String arg = args[i];
            boolean isFile = arg.equals("-file");
            if (isFile || arg.equals("-xml")) {
                if (i + 1 >= args.length) {
                    // The option is missing its value.
                    System.err.println(USAGE);
                    return;
                }
                String value = args[++i];
                if (isFile) {
                    file = value;
                } else {
                    fileType = xml;
                    xmlTag = value;
                }
            } else if (arg.equals("-html")) {
                fileType = html;
            } else if (arg.equals("-noSplitSentence")) {
                splitSentences = false;
            }
        }
        if (file == null) {
            System.err.println(USAGE);
            return;
        }

        DocumentPreprocessor docPreprocessor = new DocumentPreprocessor();
        List<?> docs;
        if (fileType == xml) {
            docs = docPreprocessor.getSentencesFromXML(file, xmlTag);
        } else if (fileType == html) {
            if (splitSentences) {
                docs = docPreprocessor.getSentencesFromHTML(file);
            } else {
                docs = docPreprocessor.getWordsFromHTML(file);
            }
        } else {
            if (splitSentences) {
                docs = docPreprocessor.getSentencesFromText(file);
            } else {
                docs = docPreprocessor.getWordsFromText(file);
            }
        }
        if (docs == null) {
            // HTML parsing failed; the error has already been printed.
            return;
        }
        System.err.println(docs.size());
        for (Object doc : docs) {
            System.err.println(doc);
        }
    }

    /** Applies a per-sentence escaper to every sentence of a document, returning a new outer list. */
    private static class ListEscaper
    implements Function<List<List<HasWord>>, List<List<HasWord>>> {
        private final Function<List<HasWord>, List<HasWord>> f;

        public ListEscaper(Function<List<HasWord>, List<HasWord>> f) {
            this.f = f;
        }

        @Override
        public List<List<HasWord>> apply(List<List<HasWord>> lists) {
            List<List<HasWord>> result = new ArrayList<List<HasWord>>(lists.size());
            for (List<HasWord> sentence : lists) {
                result.add(this.f.apply(sentence));
            }
            return result;
        }
    }

    /** Escaper that returns its argument unchanged. */
    private static class NullEscaper
    implements Function<List<HasWord>, List<HasWord>> {
        private NullEscaper() {
        }

        @Override
        public List<HasWord> apply(List<HasWord> hasWords) {
            return hasWords;
        }
    }
}

