package org.languagetool.tokenizers.cs;

import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import org.languagetool.tokenizers.SentenceTokenizer;

/* loaded from: input_file:org/languagetool/tokenizers/cs/CzechSentenceTokenizer.class */
/**
 * Regex-based sentence splitter for Czech text.
 *
 * <p>Splitting works in three passes over the input string:
 * <ol>
 *   <li>{@link #firstSentenceSplitting(String)} inserts an end-of-sentence marker
 *       ({@link #EOS}) after every candidate boundary;</li>
 *   <li>{@link #removeFalseEndOfSentence(String)} deletes markers that follow
 *       abbreviations, titles, single letters, dates and similar false positives;</li>
 *   <li>{@link #splitUnsplitStuff(String)} re-inserts a few markers that the second
 *       pass removed too eagerly.</li>
 * </ol>
 * The marked-up string is then cut at the markers.
 */
public class CzechSentenceTokenizer extends SentenceTokenizer {
    /**
     * Internal end-of-sentence marker inserted into the text while splitting.
     *
     * <p>NOTE(review): the decompiled listing showed two U+FFFD replacement characters here
     * and inside every pattern literal. That is what the class-file encoding of U+0000
     * (bytes C0 80) looks like when decoded as plain UTF-8, so the marker is restored to NUL.
     * With U+FFFD as the marker, any replacement character already present in the input would
     * have been treated as a sentence boundary and dropped from the output.
     */
    private static final String EOS = "\0";

    /** Sentence-final punctuation. */
    private static final String P = "[\\.!?…]";
    /** Optional closing quote or bracket that may directly follow the punctuation. */
    private static final String AP = "(?:'|«|\"|”|\\)|\\]|\\})?";
    /** Punctuation followed by the optional closing quote/bracket. */
    private static final String PAP = P + AP;
    /** Body of a character class matching one "word" character, including Czech diacritics. */
    private static final String WORD_CHARS = "\\wáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ";

    /** Academic and professional titles; a period after one of these does not end a sentence. */
    private static final String TITLES = "Bc|BcA|Ing|Ing.arch|MUDr|MVDr|MgA|Mgr|JUDr|PhDr|RNDr|PharmDr|ThLic|ThDr|Ph.D|Th.D|prof|doc|CSc|DrSc|dr. h. c|PaedDr|Dr|PhMr|DiS";
    /**
     * Abbreviations (plus {@link #TITLES}) used as a regex alternation. The dots inside the
     * entries are not escaped, so within the regex they match any character.
     */
    private static final String ABBREVIATIONS = "abt|ad|a.i|aj|angl|anon|apod|atd|atp|aut|bd|biogr|b.m|b.p|b.r|cca|cit|cizojaz|c.k|col|čes|čín|čj|ed|facs|fasc|fol|fot|franc|h.c|hist|hl|hrsg|ibid|il|ind|inv.č|jap|jhdt|jv|koed|kol|korej|kl|krit|lat|lit|m.a|maď|mj|mp|násl|např|nepubl|něm|no|nr|n.s|okr|odd|odp|obr|opr|orig|phil|pl|pokrač|pol|port|pozn|př.kr|př.n.l|přel|přeprac|příl|pseud|pt|red|repr|resp|revid|rkp|roč|roz|rozš|samost|sect|sest|seš|sign|sl|srv|stol|sv|šk|šk.ro|špan|tab|t.č|tis|tj|tř|tzv|univ|uspoř|vol|vl.jm|vs|vyd|vyobr|zal|zejm|zkr|zprac|zvl|n.p|" + TITLES;

    // --- pass 1: candidate boundaries -------------------------------------------------------
    /** A line break, optional whitespace, and another line break. */
    private static final Pattern paragraphByTwoLineBreaks = Pattern.compile("(\\n\\s*\\n)");
    /** A single line break. */
    private static final Pattern paragraphByLineBreak = Pattern.compile("(\\n)");
    /** Punctuation (optionally followed by U+0002) and one whitespace character. */
    private static final Pattern punctWhitespace = Pattern.compile("(" + PAP + "(\u0002)?\\s)");
    /** Punctuation immediately followed by an upper-case letter and a non-upper-case, non-dot character. */
    private static final Pattern punctUpperLower = Pattern.compile("(" + PAP + ")([\\p{Lu}][^\\p{Lu}.])");
    /** Whitespace, one word character, then punctuation. */
    private static final Pattern letterPunct = Pattern.compile("(\\s[" + WORD_CHARS + "]" + P + ")");

    // --- pass 2: false boundaries (each pattern ends in, or contains, the EOS marker) --------
    /** Single word character + punctuation + whitespace, not preceded by a word character or hyphen. */
    private static final Pattern abbrev1 = Pattern.compile("([^-" + WORD_CHARS + "][" + WORD_CHARS + "]" + PAP + "\\s)" + EOS);
    /** Same as {@link #abbrev1} but with the marker directly after the punctuation. */
    private static final Pattern abbrev2 = Pattern.compile("([^-" + WORD_CHARS + "][" + WORD_CHARS + "]" + P + ")" + EOS);
    /** Whitespace, single word character, period, whitespace. */
    private static final Pattern abbrev3 = Pattern.compile("(\\s[" + WORD_CHARS + "]\\.\\s+)" + EOS);
    /** Three dots and a space followed by a lower-case letter. */
    private static final Pattern abbrev4 = Pattern.compile("(\\.\\.\\. )" + EOS + "([\\p{Ll}])");
    /** Punctuation enclosed in straight quotes. */
    private static final Pattern abbrev5 = Pattern.compile("(['\"]" + P + "['\"]\\s+)" + EOS);
    /** Straight quote followed (after the marker) by a lower-case letter. */
    private static final Pattern abbrev6 = Pattern.compile("([\"']\\s*)" + EOS + "(\\s*[\\p{Ll}])");
    /** Punctuation standing alone between whitespace. */
    private static final Pattern abbrev7 = Pattern.compile("(\\s" + PAP + "\\s)" + EOS);
    /** Numeric date prefix such as "1.12. ". */
    private static final Pattern abbrev8 = Pattern.compile("(\\d{1,2}\\.\\d{1,2}\\.\\s+)" + EOS);
    /** Known abbreviation or title followed by punctuation and whitespace. Compiled once instead of per call. */
    private static final Pattern knownAbbreviation = Pattern.compile("(?u)(\\b(" + ABBREVIATIONS + ")" + PAP + "\\s)" + EOS);
    /** Number with a period, a space, then letters that are not upper-case. */
    private static final Pattern numberThenNonUpper = Pattern.compile("(\\d+\\.) " + EOS + "([\\p{L}&&[^\\p{Lu}]]+)");
    /** Parenthesised run of '!' / '?' followed by a space, e.g. "(!) ". */
    private static final Pattern parenthesisedPunct = Pattern.compile("\\(([!?]+)\\) " + EOS);

    // --- pass 3: repairs --------------------------------------------------------------------
    /** Apostrophe, word character, punctuation, whitespace. */
    private static final Pattern repair1 = Pattern.compile("('[" + WORD_CHARS + "]" + P + ")(\\s)");
    /** "no." followed by whitespace that is not followed by a digit. */
    private static final Pattern repair2 = Pattern.compile("(\\sno\\.)(\\s+)(?!\\d)");

    /** Pattern currently used to detect paragraph breaks; selected by {@link #setSingleLineBreaksMarksParagraph(boolean)}. */
    private Pattern paragraph;

    /** Creates a tokenizer that treats only two line breaks (with optional whitespace between) as a paragraph break. */
    public CzechSentenceTokenizer() {
        setSingleLineBreaksMarksParagraph(false);
    }

    /**
     * Selects how paragraph breaks are recognised.
     *
     * @param z {@code true} if every single line break ends a paragraph; {@code false} if only
     *          two line breaks separated by optional whitespace do
     */
    @Override // org.languagetool.tokenizers.SentenceTokenizer
    public final void setSingleLineBreaksMarksParagraph(boolean z) {
        this.paragraph = z ? paragraphByLineBreak : paragraphByTwoLineBreaks;
    }

    /**
     * Splits {@code str} into sentences.
     *
     * @param str the text to split
     * @return the sentences in input order; whitespace that follows sentence-final punctuation
     *         stays attached to the preceding sentence, and empty pieces are not returned
     */
    @Override // org.languagetool.tokenizers.SentenceTokenizer, org.languagetool.tokenizers.Tokenizer
    public final List<String> tokenize(String str) {
        String marked = firstSentenceSplitting(str);
        marked = removeFalseEndOfSentence(marked);
        marked = splitUnsplitStuff(marked);

        List<String> sentences = new ArrayList<String>();
        StringTokenizer pieces = new StringTokenizer(marked, EOS);
        while (pieces.hasMoreTokens()) {
            sentences.add(pieces.nextToken());
        }
        return sentences;
    }

    /**
     * Pass 1: inserts an {@link #EOS} marker after every candidate sentence boundary.
     * The order of the replacements matters because each one sees the previous one's output.
     */
    private String firstSentenceSplitting(String str) {
        String s = this.paragraph.matcher(str).replaceAll("$1" + EOS);
        s = punctWhitespace.matcher(s).replaceAll("$1" + EOS);
        // The marker goes between the punctuation and the following upper-case letter.
        s = punctUpperLower.matcher(s).replaceAll("$1" + EOS + "$2");
        s = letterPunct.matcher(s).replaceAll("$1" + EOS);
        return s;
    }

    /**
     * Pass 2: removes {@link #EOS} markers that do not mark a real sentence end
     * (single letters, known abbreviations and titles, ellipses, quoted punctuation, dates, ...).
     *
     * @param str text containing markers, as produced by the first pass
     * @return the text with the false markers deleted; all other characters are kept
     */
    @Override // org.languagetool.tokenizers.SentenceTokenizer
    protected String removeFalseEndOfSentence(String str) {
        String s = abbrev1.matcher(str).replaceAll("$1");
        s = abbrev2.matcher(s).replaceAll("$1");
        s = abbrev3.matcher(s).replaceAll("$1");
        s = abbrev4.matcher(s).replaceAll("$1$2");
        s = abbrev5.matcher(s).replaceAll("$1");
        s = knownAbbreviation.matcher(s).replaceAll("$1");
        s = abbrev6.matcher(s).replaceAll("$1$2");
        s = abbrev7.matcher(s).replaceAll("$1");
        s = abbrev8.matcher(s).replaceAll("$1");
        s = numberThenNonUpper.matcher(s).replaceAll("$1 $2");
        s = parenthesisedPunct.matcher(s).replaceAll("($1) ");
        return s;
    }

    /** Pass 3: puts back markers in the two situations covered by {@link #repair1} and {@link #repair2}. */
    private String splitUnsplitStuff(String str) {
        String s = repair1.matcher(str).replaceAll("$1" + EOS + "$2");
        s = repair2.matcher(s).replaceAll("$1" + EOS + "$2");
        return s;
    }
}
