package edu.cmu.meteor.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.Hashtable;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/cmu/meteor/util/Normalizer.class */
public class Normalizer {
    private static String s_space = " ";
    private static String alpha = "A-Za-zŠŽšžŸÀ-ÖØ-öø-žЀ-ӿԀ-ԧꙀ-ꙮ꙾-ꚗᴀ-ᵿ";
    private static String alnum = "0-9A-Za-zŠŽšžŸÀ-ÖØ-öø-žЀ-ӿԀ-ԧꙀ-ꙮ꙾-ꚗᴀ-ᵿ";
    private static Pattern r_sep_other = Pattern.compile("([^" + alnum + "\\s\\.\\'\\`\\,\\-\\‘\\’])");
    private static String s_sep_other = " $1 ";
    // --- Multi-dot protection -------------------------------------------------
    // Runs of two or more dots are swapped for letter-only placeholders before the
    // other punctuation rules run, and swapped back at the end of normalization.

    // A dot followed by one or more dots; group 1 holds the dots after the first.
    private static Pattern r_multi_dot = Pattern.compile("\\.([\\.]+)");
    // Replacement for r_multi_dot: a space, the marker, then the remaining dots.
    private static String s_multi_dot = " DOTMULTI$1";
    // Marker followed by a dot: its presence means dots are still left to fold in.
    private static String s_multi_dot2 = "DOTMULTI.";
    // Marker + dot followed by a non-dot character (group 1).
    private static Pattern r_multi_dot2 = Pattern.compile("DOTMULTI\\.([^\\.])");
    // Replacement for r_multi_dot2: the dot becomes a "DOT" prefix and a space is
    // inserted before the character that followed it.
    private static String s_multi_dot3 = "DOTDOTMULTI $1";
    // Replaces s_multi_dot2 while folding; searched for again when restoring.
    private static String s_multi_dot4 = "DOTDOTMULTI";
    // Final restore step: every remaining s_multi_dot5 is replaced by s_multi_dot6.
    private static String s_multi_dot5 = "DOTMULTI";
    private static String s_multi_dot6 = ".";

    // --- Commas ---------------------------------------------------------------
    // A comma is padded with spaces (s_comma) unless it sits between two digits:
    // r_comma   = non-digit , non-digit
    // r_comma2  = digit     , non-digit
    // r_comma3  = non-digit , digit
    private static Pattern r_comma = Pattern.compile("([^\\p{Digit}])[,]([^\\p{Digit}])");
    private static String s_comma = "$1 , $2";
    private static Pattern r_comma2 = Pattern.compile("([\\p{Digit}])[,]([^\\p{Digit}])");
    private static Pattern r_comma3 = Pattern.compile("([^\\p{Digit}])[,]([\\p{Digit}])");
    private static Pattern r_quote_norm = Pattern.compile("([`‘’])");
    private static String s_quote_norm = "'";
    private static Pattern r_quote_norm2 = Pattern.compile("([“”]|'')");
    private static String s_quote_norm2 = " \" ";
    private static String s_dash_norm = "–";
    private static String s_dash_norm2 = "-";
    private static String s_dash_norm3 = "--";
    // --- Apostrophes, rule set applied by normalizeLine when the language ID is 0 ---
    // (presumably English, going by the "_en" suffix -- confirm against Constants)
    // non-alpha ' non-alpha  ->  apostrophe becomes its own token (s_cont_en)
    private static Pattern r_cont_en = Pattern.compile("([^" + alpha + "])[']([^" + alpha + "])");
    private static String s_cont_en = "$1 ' $2";
    // (non-alpha, non-digit) ' alpha  ->  s_cont_en
    private static Pattern r_cont_en2 = Pattern.compile("([^" + alpha + "\\p{Digit}])[']([" + alpha + "])");
    // alpha ' non-alpha  ->  s_cont_en
    private static Pattern r_cont_en3 = Pattern.compile("([" + alpha + "])[']([^" + alpha + "])");
    // alpha ' alpha  ->  s_cont_en2: split before the apostrophe, which stays
    // attached to the following letters
    private static Pattern r_cont_en4 = Pattern.compile("([" + alpha + "])[']([" + alpha + "])");
    private static String s_cont_en2 = "$1 '$2";
    // digit ' s  ->  s_cont_en2
    private static Pattern r_cont_en5 = Pattern.compile("([\\p{Digit}])[']([s])");

    // --- Apostrophes, rule set applied by normalizeLine when the language ID is 2 ---
    // (presumably French, going by the "_fr" suffix -- confirm against Constants)
    // r_cont_fr, r_cont_fr2 and r_cont_fr3 are all replaced with s_cont_fr.
    private static Pattern r_cont_fr = Pattern.compile("([^" + alpha + "])[']([^" + alpha + "])");
    private static String s_cont_fr = "$1 ' $2";
    private static Pattern r_cont_fr2 = Pattern.compile("([^" + alpha + "])[']([" + alpha + "])");
    private static Pattern r_cont_fr3 = Pattern.compile("([" + alpha + "])[']([^" + alpha + "])");
    // alpha ' alpha  ->  s_cont_fr2: split after the apostrophe, which stays
    // attached to the preceding letters
    private static Pattern r_cont_fr4 = Pattern.compile("([" + alpha + "])[']([" + alpha + "])");
    private static String s_cont_fr2 = "$1' $2";

    // --- Apostrophes, every other language handled by normalizeLine ---
    // Each s_cont_other1 is replaced by s_cont_other2 (apostrophe padded with spaces).
    private static String s_cont_other1 = "'";
    private static String s_cont_other2 = " ' ";

    // Punctuation stripping: any character outside the alnum class becomes a space.
    private static Pattern r_punct_strip = Pattern.compile("[^" + alnum + "]");
    private static String s_punct_strip = " ";

    // Hyphen between (alnum or dot) and alnum is replaced by a space.
    private static Pattern r_rm_dash = Pattern.compile("([" + alnum + "\\.])[\\-]([" + alnum + "])");
    private static String s_rm_dash = "$1 $2";
    private static Pattern r_white = Pattern.compile("[  \u2000\u2001\u2002\u2003\u2004\u2005\u2006 \u2008\u2009\u200a \u205f\u3000 ]+");
    private static String s_white = " ";
    // --- Nonbreaking-prefix handling --------------------------------------------
    // Cached prefix table filled by nbpList(): key = prefix token, value = 1, or 2
    // when the prefix line carries the #NUMERIC_ONLY# marker.
    private static Hashtable<String, Integer> nbpDict = null;
    // Language ID the cached table was loaded for; nbpList() sets it to 99 while
    // loading and leaves it there if loading fails.
    private static int nbpLangID = 99;
    // Tokens longer than one character that end with s_nbp get prefix handling.
    private static String s_nbp = ".";
    // Replacement used to delete every s_nbp from a token.
    private static String s_nbp2 = "";
    // Finds any character of the alpha class.
    private static Pattern r_nbp1 = Pattern.compile("[" + alpha + "]");
    // Matches a token that starts with a \p{Lower} character.
    private static Pattern r_nbp2 = Pattern.compile("^[\\p{Lower}]");
    // Matches a token that starts with one or more ASCII digits.
    private static Pattern r_nbp3 = Pattern.compile("^[0-9]+");
    // Appended after the stem when the trailing period is split off as its own token.
    private static String s_nbp3 = " .";
    // Separator appended after every emitted token.
    private static String s_nbp4 = " ";
    private static Pattern r_punct_nonwest = Pattern.compile("([\\!-\\+\\-\\/\\:-\\@\\[-\\`\\{-¿،])");
    // --- Replacements used by normalizeNonWestern -------------------------------
    // Pads each r_punct_nonwest match with a space on both sides.
    private static String s_punct_nonwest = " $1 ";
    // When punctuation is not kept: r_punct_nonwest matches and every
    // s_punct_nonwest3 are replaced by s_punct_nonwest2.
    private static String s_punct_nonwest2 = " ";
    private static String s_punct_nonwest3 = ".";
    // A dot is padded with spaces (s_dot_nonwest) unless it sits between two digits:
    // r_dot_nonwest   = non-digit . non-digit
    // r_dot_nonwest2  = digit     . non-digit
    // r_dot_nonwest3  = non-digit . digit
    private static Pattern r_dot_nonwest = Pattern.compile("([^\\p{Digit}])[\\.]([^\\p{Digit}])");
    private static String s_dot_nonwest = "$1 . $2";
    private static Pattern r_dot_nonwest2 = Pattern.compile("([\\p{Digit}])[\\.]([^\\p{Digit}])");
    private static Pattern r_dot_nonwest3 = Pattern.compile("([^\\p{Digit}])[\\.]([\\p{Digit}])");

    // --- SGML entities used by unescapeSGML -------------------------------------
    // Flag value 2 is Pattern.CASE_INSENSITIVE, so the entity names match in any case.
    private static Pattern r_quot = Pattern.compile("&quot;", 2);
    private static Pattern r_apos = Pattern.compile("&apos;", 2);
    private static Pattern r_lt = Pattern.compile("&lt;", 2);
    private static Pattern r_gt = Pattern.compile("&gt;", 2);
    private static Pattern r_amp = Pattern.compile("&amp;", 2);
    // Literal characters substituted for the entities above.
    private static String quot = "\"";
    private static String apos = "'";
    private static String lt = "<";
    private static String gt = ">";
    private static String amp = "&";

    /**
     * Returns the nonbreaking-prefix table for a language, loading it when the cached
     * table belongs to a different language (or no table has been loaded yet).
     *
     * <p>The table is read as UTF-8 from
     * {@code Constants.DEFAULT_NBP_DIR_URL + "/" + Constants.getLanguageName(i) + ".prefixes"}.
     * Blank lines and lines whose first token starts with {@code #} are skipped. For every
     * other line the first token becomes a key; its value is 2 when the second token is
     * {@code #NUMERIC_ONLY#} and 1 otherwise.
     *
     * <p>Loading is best-effort: any failure is reported on stderr, the entries read so far
     * are kept in the returned table, and {@code nbpLangID} is left at 99.
     *
     * @param i language ID handed to {@code Constants.getLanguageName}
     * @return the prefix table, never {@code null}; empty or partial if loading failed
     */
    private static Hashtable<String, Integer> nbpList(int i) {
        if (nbpDict != null && nbpLangID == i) {
            return nbpDict;
        }
        nbpDict = new Hashtable<>();
        nbpLangID = 99;
        try {
            URL prefixUrl = new URL(Constants.DEFAULT_NBP_DIR_URL.toString() + "/" + Constants.getLanguageName(i) + ".prefixes");
            // try-with-resources: the stream is closed even when reading or parsing fails.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(prefixUrl.openStream(), "UTF-8"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    StringTokenizer fields = new StringTokenizer(line);
                    if (!fields.hasMoreTokens()) {
                        continue; // blank line
                    }
                    String prefix = fields.nextToken();
                    if (prefix.startsWith("#")) {
                        continue; // comment line
                    }
                    int kind = 1;
                    if (fields.hasMoreTokens() && fields.nextToken().equals("#NUMERIC_ONLY#")) {
                        kind = 2;
                    }
                    nbpDict.put(prefix, Integer.valueOf(kind));
                }
            }
            // Only mark the cache as valid for this language after a complete, clean read.
            nbpLangID = i;
        } catch (Exception e) {
            System.err.println("Error: Nonbreaking prefix list could not be loaded:");
            e.printStackTrace();
        }
        return nbpDict;
    }

    /**
     * Normalizes and tokenizes one line of text; the result is a single-space separated
     * token string with no leading or trailing whitespace.
     *
     * <p>Language IDs 5 and 99 are handed to {@link #normalizeNonWestern}. For all other
     * supported IDs the steps are, in order: pad characters matched by {@code r_sep_other},
     * protect runs of dots with placeholders, pad commas that are not between two digits,
     * normalize quote and dash characters, replace hyphens matched by {@code r_rm_dash} with
     * a space, split apostrophes (the {@code r_cont_en*} rules for ID 0, the
     * {@code r_cont_fr*} rules for ID 2, plain padding otherwise), decide for each token
     * ending in a period whether that period stays attached, restore the protected dots,
     * optionally strip punctuation, and collapse whitespace.
     *
     * @param str the line to normalize
     * @param i   language ID
     * @param z   {@code true} to keep punctuation; {@code false} to replace every character
     *            matched by {@code r_punct_strip} with a space
     * @return the normalized line
     * @throws RuntimeException if {@code Constants.isSupported(i)} is false
     */
    public static String normalizeLine(String str, int i, boolean z) {
        if (!Constants.isSupported(i)) {
            System.err.println("Error: Pre-process the input files and run Meteor without the -norm option.");
            String languageName = "";
            try {
                languageName = Constants.getLanguageName(i);
            } catch (Exception e) {
                // Deliberately ignored: the name only decorates the message thrown below.
            }
            throw new RuntimeException("No normalizer for language (" + languageName + ")");
        }
        if (i == 5 || i == 99) {
            return normalizeNonWestern(str, Boolean.valueOf(z));
        }

        // Pad with spaces so the context-sensitive patterns also fire at the line edges.
        String text = s_space + str + s_space;
        text = r_sep_other.matcher(text).replaceAll(s_sep_other);

        // Protect runs of dots: fold each dot into the placeholder until none is left.
        text = r_multi_dot.matcher(text).replaceAll(s_multi_dot);
        while (text.contains(s_multi_dot2)) {
            text = r_multi_dot2.matcher(text).replaceAll(s_multi_dot3);
            text = text.replace(s_multi_dot2, s_multi_dot4);
        }

        // Commas, quotes, dashes, hyphens.
        text = r_comma.matcher(text).replaceAll(s_comma);
        text = r_comma2.matcher(text).replaceAll(s_comma);
        text = r_comma3.matcher(text).replaceAll(s_comma);
        text = r_quote_norm.matcher(text).replaceAll(s_quote_norm);
        text = r_quote_norm2.matcher(text).replaceAll(s_quote_norm2);
        text = text.replace(s_dash_norm, s_dash_norm2);
        text = text.replace(s_dash_norm3, s_dash_norm2);
        text = r_rm_dash.matcher(text).replaceAll(s_rm_dash);

        // Apostrophes: language-specific splitting.
        switch (i) {
            case 0:
                text = r_cont_en.matcher(text).replaceAll(s_cont_en);
                text = r_cont_en2.matcher(text).replaceAll(s_cont_en);
                text = r_cont_en3.matcher(text).replaceAll(s_cont_en);
                text = r_cont_en4.matcher(text).replaceAll(s_cont_en2);
                text = r_cont_en5.matcher(text).replaceAll(s_cont_en2);
                break;
            case 2:
                text = r_cont_fr.matcher(text).replaceAll(s_cont_fr);
                text = r_cont_fr2.matcher(text).replaceAll(s_cont_fr);
                text = r_cont_fr3.matcher(text).replaceAll(s_cont_fr);
                text = r_cont_fr4.matcher(text).replaceAll(s_cont_fr2);
                break;
            default:
                text = text.replace(s_cont_other1, s_cont_other2);
                break;
        }

        // Split on whitespace into an indexable array (the period rules look one token ahead).
        StringTokenizer splitter = new StringTokenizer(text);
        String[] tokens = new String[splitter.countTokens()];
        for (int t = 0; t < tokens.length; t++) {
            tokens[t] = splitter.nextToken();
        }

        Hashtable<String, Integer> prefixes = nbpList(i);
        StringBuilder joined = new StringBuilder();
        for (int idx = 0; idx < tokens.length; idx++) {
            String token = tokens[idx];
            String emitted = token;
            if (token.length() > 1 && token.endsWith(s_nbp)) {
                String stem = token.substring(0, token.length() - 1);
                Integer prefixKind = prefixes.get(stem);
                if (stem.contains(s_nbp) && r_nbp1.matcher(stem).find()) {
                    // Inner periods plus at least one letter: drop every period from the token.
                    emitted = token.replace(s_nbp, s_nbp2);
                } else {
                    boolean hasNext = idx < tokens.length - 1;
                    boolean listedPrefix = prefixKind != null && prefixKind.intValue() == 1;
                    boolean nextStartsLower = hasNext && r_nbp2.matcher(tokens[idx + 1]).find();
                    boolean numericPrefix = prefixKind != null && prefixKind.intValue() == 2
                            && hasNext && r_nbp3.matcher(tokens[idx + 1]).find();
                    if (!(listedPrefix || nextStartsLower || numericPrefix)) {
                        // The period is not part of the token: split it off.
                        emitted = stem + s_nbp3;
                    }
                }
            }
            joined.append(emitted).append(s_nbp4);
        }

        // Restore the protected dots: unfold the placeholders, then drop the marker itself.
        String result = joined.toString();
        while (result.contains(s_multi_dot4)) {
            result = result.replace(s_multi_dot4, s_multi_dot2);
        }
        result = result.replace(s_multi_dot5, s_multi_dot6);

        if (!z) {
            result = r_punct_strip.matcher(result).replaceAll(s_punct_strip);
        }
        return r_white.matcher(result).replaceAll(s_white).trim();
    }

    /**
     * Normalization path used by {@link #normalizeLine} for language IDs 5 and 99.
     *
     * <p>Steps, in order: protect runs of dots with placeholders, pad commas and dots that
     * are not between two digits, normalize quote and dash characters, pad every character
     * matched by {@code r_punct_nonwest}, optionally blank out punctuation, restore the
     * protected dots, and collapse whitespace.
     *
     * <p>NOTE(review): the placeholders are restored after the punctuation is blanked, so
     * runs of two or more dots reappear in the output even when {@code bool} is false.
     * {@code normalizeLine} strips punctuation after restoring. Confirm whether the
     * difference is intended before changing either.
     *
     * @param str  the line to normalize
     * @param bool {@code true} to keep punctuation; must not be {@code null}
     * @return the normalized line, single-space separated and trimmed
     */
    private static String normalizeNonWestern(String str, Boolean bool) {
        // Pad with spaces so the context-sensitive patterns also fire at the line edges.
        String text = r_multi_dot.matcher(s_space + str + s_space).replaceAll(s_multi_dot);
        while (text.contains(s_multi_dot2)) {
            text = r_multi_dot2.matcher(text).replaceAll(s_multi_dot3);
            text = text.replace(s_multi_dot2, s_multi_dot4);
        }

        // Commas, quotes, dashes.
        text = r_comma.matcher(text).replaceAll(s_comma);
        text = r_comma2.matcher(text).replaceAll(s_comma);
        text = r_comma3.matcher(text).replaceAll(s_comma);
        text = r_quote_norm.matcher(text).replaceAll(s_quote_norm);
        text = r_quote_norm2.matcher(text).replaceAll(s_quote_norm2);
        text = text.replace(s_dash_norm, s_dash_norm2);
        text = text.replace(s_dash_norm3, s_dash_norm2);

        // Dots outside numbers, then the remaining punctuation characters.
        text = r_dot_nonwest.matcher(text).replaceAll(s_dot_nonwest);
        text = r_dot_nonwest2.matcher(text).replaceAll(s_dot_nonwest);
        text = r_dot_nonwest3.matcher(text).replaceAll(s_dot_nonwest);
        text = r_punct_nonwest.matcher(text).replaceAll(s_punct_nonwest);

        if (!bool.booleanValue()) {
            text = r_punct_nonwest.matcher(text).replaceAll(s_punct_nonwest2);
            text = text.replace(s_punct_nonwest3, s_punct_nonwest2);
        }

        // Restore the protected dots: unfold the placeholders, then drop the marker itself.
        while (text.contains(s_multi_dot4)) {
            text = text.replace(s_multi_dot4, s_multi_dot2);
        }
        text = text.replace(s_multi_dot5, s_multi_dot6);

        return r_white.matcher(text).replaceAll(s_white).trim();
    }

    /**
     * Replaces the SGML entities {@code &quot;}, {@code &apos;}, {@code &lt;},
     * {@code &gt;} and {@code &amp;} (matched case-insensitively) with the characters
     * they stand for.
     *
     * <p>{@code &amp;} is handled last, so an escaped entity such as {@code &amp;lt;}
     * comes out as the text {@code &lt;} rather than being unescaped twice.
     *
     * @param str text that may contain the entities above
     * @return the text with those entities replaced
     */
    public static String unescapeSGML(String str) {
        String text = r_quot.matcher(str).replaceAll(quot);
        text = r_apos.matcher(text).replaceAll(apos);
        text = r_lt.matcher(text).replaceAll(lt);
        text = r_gt.matcher(text).replaceAll(gt);
        return r_amp.matcher(text).replaceAll(amp);
    }

    /**
     * Command-line filter: normalizes every line read from standard input and prints the
     * result to standard output.
     *
     * <p>Standard input is decoded with the platform default charset, because the
     * {@code InputStreamReader} is created without an explicit one.
     *
     * @param strArr {@code strArr[0]} is a language name, resolved through
     *               {@code Constants.normLanguageName} and {@code Constants.getLanguageID};
     *               {@code strArr[1]} is parsed with {@code Boolean.parseBoolean} and passed
     *               to {@link #normalizeLine} as its keep-punctuation flag. With fewer than
     *               two arguments a usage message is printed and nothing is read.
     * @throws IOException if reading standard input fails
     */
    public static void main(String[] strArr) throws IOException {
        if (strArr.length < 2) {
            System.out.println("Usage: Normalizer lang punct");
            System.out.println("where punct is true/false");
            return;
        }
        int languageID = Constants.getLanguageID(Constants.normLanguageName(strArr[0]));
        boolean keepPunctuation = Boolean.parseBoolean(strArr[1]);
        BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
        String line;
        while ((line = stdin.readLine()) != null) {
            System.out.println(normalizeLine(line, languageID, keepPunctuation));
        }
    }
}
