package org.languagetool.tokenizers;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

/* loaded from: input_file:org/languagetool/tokenizers/SentenceTokenizer.class */
/**
 * Rule-based sentence splitter.
 *
 * <p>Works in three regex passes over the text: first an end-of-sentence marker ({@link #EOS})
 * is inserted after every candidate sentence end, then markers that follow known
 * abbreviations and similar false positives are removed again, and finally a few markers are
 * added where the first pass is known to miss a boundary. The result is cut at the markers.
 *
 * <p>Subclasses can extend the abbreviation list through the constructor and can set
 * {@link #monthNames} to keep dates such as "3. &lt;month&gt;" together.
 */
public class SentenceTokenizer implements Tokenizer {
    /**
     * End-of-sentence marker inserted into the text while splitting.
     * NOTE(review): this literal looks like an encoding-damaged control character; confirm the
     * intended value against the upstream source before changing it, because subclasses
     * compiled against this constant will have the value inlined.
     */
    protected static final String EOS = "��";
    /** Regex for sentence-final punctuation. */
    protected static final String P = "[\\.!?]";
    /**
     * Regex for an optional closing quote or bracket directly after the punctuation.
     * NOTE(review): the empty alternative between the two pipes looks like a character lost to
     * an encoding problem; confirm against upstream.
     */
    protected static final String AP = "(?:'|«|\"||\\)|\\]|\\})?";
    /** Punctuation followed by the optional closing quote or bracket. */
    protected static final String PAP = P + AP;
    /** Regex for a single round or square bracket. */
    protected static final String PARENS = "[\\(\\)\\[\\]]";

    /** A single word character, including the German umlauts and sharp s. */
    private static final String LETTER = "[\\wüöäÜÖÄß]";
    /** Any character that is neither a hyphen nor matched by {@link #LETTER}. */
    private static final String NOT_LETTER_OR_HYPHEN = "[^-\\wüöäÜÖÄß]";

    /** Paragraph separator currently in use; one of the two patterns declared below. */
    private Pattern paragraph;
    private static final Pattern paragraphByTwoLineBreaks = Pattern.compile("([\\n\\r]\\s*[\\n\\r])");
    private static final Pattern paragraphByLineBreak = Pattern.compile("([\\n\\r])");

    // First pass: places where a sentence may end.
    private static final Pattern punctWhitespace = Pattern.compile("(" + PAP + "(\u0002)?\\s)");
    private static final Pattern punctUpperLower = Pattern.compile("(" + PAP + ")([\\p{Lu}][^\\p{Lu}.])");
    private static final Pattern letterPunct = Pattern.compile("(\\s" + LETTER + P + ")");

    // Second pass: markers that were set by mistake (each pattern ends in or contains EOS).
    private static final Pattern abbrev1 = Pattern.compile("(" + NOT_LETTER_OR_HYPHEN + LETTER + PAP + "\\s)" + EOS);
    private static final Pattern abbrev2 = Pattern.compile("(" + NOT_LETTER_OR_HYPHEN + LETTER + P + ")" + EOS);
    private static final Pattern abbrev3 = Pattern.compile("(\\s" + LETTER + "\\.\\s+)" + EOS);
    private static final Pattern abbrev4 = Pattern.compile("(\\.\\.\\. )" + EOS + "([\\p{Ll}])");
    private static final Pattern abbrev5 = Pattern.compile("(['\"]" + P + "['\"]\\s+)" + EOS);
    private static final Pattern abbrev6 = Pattern.compile("([\"']\\s*)" + EOS + "(\\s*[\\p{Ll}])");
    private static final Pattern abbrev7 = Pattern.compile("(\\s" + PAP + "\\s)" + EOS);
    private static final Pattern abbrev8 = Pattern.compile("(\\d{1,2}\\.\\d{1,2}\\.\\s+)" + EOS);

    // Third pass: boundaries the first two passes are known to get wrong.
    private static final Pattern repair1 = Pattern.compile("('" + LETTER + P + ")(\\s)");
    private static final Pattern repair2 = Pattern.compile("(\\sno\\.)(\\s+)(?!\\d)");
    private static final Pattern repair3 = Pattern.compile("([ap]\\.m\\.\\s+)([\\p{Lu}])");

    // Part of the second pass: drop markers directly after bracketed or bracket-closing text.
    private static final Pattern repair10 = Pattern.compile("([\\(\\[])([!?]+)([\\]\\)]) " + EOS);
    private static final Pattern repair11 = Pattern.compile("([!?]+)([\\)\\]]) " + EOS);
    private static final Pattern repair12 = Pattern.compile("(" + PARENS + ") " + EOS);

    /** Abbreviations that never end a sentence, written without the trailing punctuation. */
    private static final String[] ABBREV_LIST = {"Mr", "Mrs", "No", "pp", "St", "no", "Sr", "Jr", "Bros", "etc", "vs", "esp", "Fig", "fig", "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Sept", "Oct", "Okt", "Nov", "Dec", "Ph.D", "PhD", "al", "cf", "Inc", "Ms", "Gen", "Sen", "Prof", "Corp", "Co"};

    /**
     * One compiled pattern per abbreviation, caller-supplied ones first. Kept as a list:
     * {@link Pattern} uses identity equality, so a set would not remove duplicates and would
     * only make the iteration order arbitrary.
     */
    private final List<Pattern> abbreviationPatterns;

    /**
     * Month names used to re-join "&lt;digits&gt;. &lt;month&gt;"; {@code null} disables the step.
     * Each entry is inserted into a regular expression unescaped.
     */
    protected String[] monthNames;

    /** Creates a tokenizer that knows only the built-in abbreviations. */
    public SentenceTokenizer() {
        this(new String[0]);
    }

    /**
     * Creates a tokenizer with additional abbreviations.
     *
     * @param strArr extra abbreviations without the trailing punctuation; each entry is
     *               inserted into a regular expression unescaped. {@code null} is treated
     *               as an empty array.
     */
    public SentenceTokenizer(String[] strArr) {
        List<String> abbreviations = new ArrayList<String>();
        if (strArr != null) {
            abbreviations.addAll(Arrays.asList(strArr));
        }
        abbreviations.addAll(Arrays.asList(ABBREV_LIST));

        this.abbreviationPatterns = new ArrayList<Pattern>(abbreviations.size());
        for (String abbreviation : abbreviations) {
            this.abbreviationPatterns.add(Pattern.compile("(\\b" + abbreviation + PAP + "\\s)" + EOS));
        }
        setSingleLineBreaksMarksParagraph(false);
    }

    /**
     * Selects what counts as a paragraph break.
     *
     * @param z {@code true} if a single line break ends a paragraph, {@code false} if it
     *          takes two line breaks (optionally separated by whitespace)
     */
    public void setSingleLineBreaksMarksParagraph(boolean z) {
        this.paragraph = z ? paragraphByLineBreak : paragraphByTwoLineBreaks;
    }

    /**
     * @return {@code true} if a single line break is currently treated as a paragraph break
     */
    public boolean singleLineBreaksMarksPara() {
        return this.paragraph == paragraphByLineBreak;
    }

    /**
     * Splits text into sentences.
     *
     * @param str the text to split; must not be {@code null}
     * @return the sentences in order of appearance; whitespace after a sentence end stays
     *         attached to the preceding sentence
     */
    @Override // org.languagetool.tokenizers.Tokenizer
    public List<String> tokenize(String str) {
        String marked = splitUnsplitStuff(removeFalseEndOfSentence(firstSentenceSplitting(str)));
        // StringTokenizer treats every character of EOS as a delimiter and never returns empty
        // tokens, so adjacent markers do not produce empty sentences.
        StringTokenizer stringTokenizer = new StringTokenizer(marked, EOS);
        List<String> sentences = new ArrayList<String>();
        while (stringTokenizer.hasMoreTokens()) {
            sentences.add(stringTokenizer.nextToken());
        }
        return sentences;
    }

    /**
     * First pass: inserts {@link #EOS} after every candidate sentence end.
     */
    private String firstSentenceSplitting(String str) {
        String s = str;
        // Paragraph breaks always end a sentence.
        s = this.paragraph.matcher(s).replaceAll("$1" + EOS);
        // Punctuation (plus optional closing quote/bracket) followed by whitespace.
        s = punctWhitespace.matcher(s).replaceAll("$1" + EOS);
        // Punctuation directly followed by an upper-case letter and then a character that is
        // neither upper-case nor a period.
        s = punctUpperLower.matcher(s).replaceAll("$1" + EOS + "$2");
        // Whitespace, a single letter, punctuation.
        s = letterPunct.matcher(s).replaceAll("$1" + EOS);
        return s;
    }

    /**
     * Second pass: removes {@link #EOS} markers that do not mark a real sentence end.
     *
     * @param str text that already contains end-of-sentence markers
     * @return the text with the falsely placed markers removed
     */
    protected String removeFalseEndOfSentence(String str) {
        String s = str;
        // Single letter followed by punctuation, with and without trailing whitespace.
        s = abbrev1.matcher(s).replaceAll("$1");
        s = abbrev2.matcher(s).replaceAll("$1");
        s = abbrev3.matcher(s).replaceAll("$1");
        // "... " followed by a lower-case letter.
        s = abbrev4.matcher(s).replaceAll("$1$2");
        // Punctuation enclosed in quotes.
        s = abbrev5.matcher(s).replaceAll("$1");

        // Known abbreviations followed by punctuation and whitespace.
        for (Pattern abbreviationPattern : this.abbreviationPatterns) {
            s = abbreviationPattern.matcher(s).replaceAll("$1");
        }

        // Quote followed by a lower-case letter.
        s = abbrev6.matcher(s).replaceAll("$1$2");
        // Punctuation standing on its own between whitespace.
        s = abbrev7.matcher(s).replaceAll("$1");
        // Numeric dates such as "1.12. ".
        s = abbrev8.matcher(s).replaceAll("$1");

        if (this.monthNames != null) {
            // Not cached: monthNames is a mutable protected field that subclasses may change.
            for (String monthName : this.monthNames) {
                s = Pattern.compile("(\\d+\\.) " + EOS + "(" + monthName + ")").matcher(s).replaceAll("$1 $2");
            }
        }

        // Bracketed "(!)"/"(?)"-style text, "!)"-style endings, and any bracket before a space.
        s = repair10.matcher(s).replaceAll("$1$2$3 ");
        s = repair11.matcher(s).replaceAll("$1$2 ");
        s = repair12.matcher(s).replaceAll("$1 ");
        return s;
    }

    /**
     * Third pass: adds {@link #EOS} where the earlier passes left a sentence end unmarked.
     */
    private String splitUnsplitStuff(String str) {
        String s = str;
        // Apostrophe, single letter, punctuation, whitespace.
        s = repair1.matcher(s).replaceAll("$1" + EOS + "$2");
        // " no." not followed by a digit.
        s = repair2.matcher(s).replaceAll("$1" + EOS + "$2");
        // "a.m." / "p.m." followed by an upper-case letter.
        s = repair3.matcher(s).replaceAll("$1" + EOS + "$2");
        return s;
    }
}
