package com.alibaba.alink.operator.common.nlp.bert.tokenizer;

import com.alibaba.alink.common.exceptions.AkUnclassifiedErrorException;
import com.alibaba.alink.operator.common.nlp.bert.tokenizer.PreTrainedTokenizer;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.IntStream;
import org.apache.commons.io.FileUtils;

/* loaded from: input_file:com/alibaba/alink/operator/common/nlp/bert/tokenizer/BertTokenizerImpl.class */
public class BertTokenizerImpl extends PreTrainedTokenizer {
    /** Token-to-id mapping built by {@code loadVocab}; each id is the token's line index in the vocabulary file. */
    Map<String, Integer> vocab;
    /** Id-to-token mapping, filled in the constructor by inverting {@code vocab}. */
    Map<Integer, String> idsToTokens;
    /** When true, {@code tokenizeImpl} runs {@code basicTokenizer} before WordPiece splitting. */
    boolean doBasicTokenize;
    /** Pre-tokenizer; the constructor assigns it only when {@code doBasicTokenize} is true. */
    BasicTokenizer basicTokenizer;
    /** WordPiece splitter built over {@code vocab} and the configured unknown token. */
    WordpieceTokenizer wordpieceTokenizer;

    /* loaded from: input_file:com/alibaba/alink/operator/common/nlp/bert/tokenizer/BertTokenizerImpl$BasicTokenizer.class */
    public static class BasicTokenizer {
        /** Tokens exempt from lower-casing, accent stripping and punctuation splitting; the constructor stores an empty set when given null. */
        Set<String> neverSplit;
        /** Whether tokens are lower-cased (with {@code Locale.ROOT}) during {@code tokenize}. */
        boolean doLowerCase;
        /** Whether {@code tokenize} surrounds Chinese characters with spaces before whitespace splitting. */
        boolean tokenizeChineseChars;
        /** Accent-stripping switch; {@code null} means accents are stripped exactly when {@code doLowerCase} is true. */
        Boolean stripAccents;

        /**
         * Creates a basic tokenizer with explicit settings.
         *
         * @param z    stored as {@code doLowerCase}
         * @param set  stored as {@code neverSplit}; {@code null} is replaced by an empty set
         * @param z2   stored as {@code tokenizeChineseChars}
         * @param bool stored as {@code stripAccents}; may be {@code null}
         */
        public BasicTokenizer(boolean z, Set<String> set, boolean z2, Boolean bool) {
            this.doLowerCase = z;
            this.tokenizeChineseChars = z2;
            this.stripAccents = bool;
            if (set != null) {
                this.neverSplit = set;
            } else {
                // Keep the field non-null so callers can query it unconditionally.
                this.neverSplit = Collections.emptySet();
            }
        }

        /**
         * Creates a tokenizer with lower-casing and Chinese-character splitting enabled,
         * no never-split tokens, and {@code stripAccents} left as {@code null}.
         */
        public BasicTokenizer() {
            this(true, null, true, null);
        }

        /**
         * Splits raw text into basic tokens: cleans the text, optionally isolates Chinese
         * characters, splits on whitespace, normalizes each token and splits it on punctuation.
         *
         * @param str text to tokenize
         * @param set extra tokens to leave untouched for this call; may be {@code null}
         * @return the resulting tokens, in order
         */
        public String[] tokenize(String str, Set<String> set) {
            // Private union of the per-call and per-instance protected tokens.
            Set<String> untouched = new HashSet<>();
            if (set != null) {
                untouched.addAll(set);
            }
            untouched.addAll(this.neverSplit);

            String text = cleanText(str);
            if (this.tokenizeChineseChars) {
                text = tokenizeChineseChars(text);
            }

            // An unset stripAccents follows doLowerCase; an explicit value always wins.
            boolean strip = (this.stripAccents == null) ? this.doLowerCase : this.stripAccents.booleanValue();

            List<String> pieces = new ArrayList<>();
            for (String word : TokenizationUtils.whitespaceTokenize(text)) {
                String current = word;
                if (!untouched.contains(current)) {
                    if (this.doLowerCase) {
                        current = current.toLowerCase(Locale.ROOT);
                    }
                    if (strip) {
                        current = runStripAccents(current);
                    }
                }
                pieces.addAll(runSplitOnPunc(current, untouched));
            }
            // Re-join and re-split so every piece ends up as its own array element.
            String joined = String.join(" ", pieces);
            return TokenizationUtils.whitespaceTokenize(joined);
        }

        /**
         * Tokenizes {@code str} without per-call never-split tokens by delegating to
         * {@code tokenize(str, null)}.
         */
        public String[] tokenize(String str) {
            return tokenize(str, null);
        }

        /**
         * Removes accents by decomposing the text to NFD and dropping every code point whose
         * general category is non-spacing mark (Mn). The result stays in decomposed form.
         *
         * @param str text to process
         * @return {@code str} in NFD with all non-spacing marks removed
         */
        static String runStripAccents(String str) {
            String decomposed = Normalizer.normalize(str, Normalizer.Form.NFD);
            StringBuilder kept = new StringBuilder(decomposed.length());
            int offset = 0;
            while (offset < decomposed.length()) {
                int codePoint = decomposed.codePointAt(offset);
                offset += Character.charCount(codePoint);
                if (Character.getType(codePoint) != Character.NON_SPACING_MARK) {
                    kept.appendCodePoint(codePoint);
                }
            }
            return kept.toString();
        }

        /**
         * Splits a token around punctuation: every punctuation code point becomes its own
         * piece and each maximal run of non-punctuation code points becomes one piece.
         *
         * @param str token to split
         * @param set protected tokens; if it contains {@code str} the token is returned whole
         * @return the pieces in their original order
         */
        static List<String> runSplitOnPunc(String str, Set<String> set) {
            if (set.contains(str)) {
                return Collections.singletonList(str);
            }
            List<String> pieces = new ArrayList<>();
            // Accumulates the current run of non-punctuation code points; a builder avoids
            // re-copying the whole run for every appended character.
            StringBuilder run = new StringBuilder();
            int offset = 0;
            while (offset < str.length()) {
                int codePoint = str.codePointAt(offset);
                offset += Character.charCount(codePoint);
                if (TokenizationUtils.isPunctuation(codePoint)) {
                    if (run.length() > 0) {
                        pieces.add(run.toString());
                        run.setLength(0);
                    }
                    pieces.add(new String(Character.toChars(codePoint)));
                } else {
                    run.appendCodePoint(codePoint);
                }
            }
            if (run.length() > 0) {
                pieces.add(run.toString());
            }
            return pieces;
        }

        /**
         * Drops NUL (U+0000), the replacement character (U+FFFD) and every code point that
         * {@code TokenizationUtils.isControl} accepts, and turns every code point that
         * {@code TokenizationUtils.isWhitespace} accepts into a plain space.
         *
         * @param str text to clean
         * @return the cleaned text
         */
        static String cleanText(String str) {
            StringBuilder cleaned = new StringBuilder(str.length());
            int offset = 0;
            while (offset < str.length()) {
                int codePoint = str.codePointAt(offset);
                offset += Character.charCount(codePoint);
                boolean dropped = codePoint == 0 || codePoint == 0xFFFD || TokenizationUtils.isControl(codePoint);
                if (dropped) {
                    continue;
                }
                if (TokenizationUtils.isWhitespace(codePoint)) {
                    cleaned.append(' ');
                } else {
                    cleaned.appendCodePoint(codePoint);
                }
            }
            return cleaned.toString();
        }

        /**
         * Surrounds every code point accepted by {@code isChineseChar} with one space on each
         * side, so that a later whitespace split isolates it; other code points are copied as-is.
         *
         * @param str text to process
         * @return the text with spaces inserted around Chinese characters
         */
        public static String tokenizeChineseChars(String str) {
            StringBuilder spaced = new StringBuilder(str.length());
            int offset = 0;
            while (offset < str.length()) {
                int codePoint = str.codePointAt(offset);
                offset += Character.charCount(codePoint);
                if (isChineseChar(codePoint)) {
                    spaced.append(' ').appendCodePoint(codePoint).append(' ');
                } else {
                    spaced.appendCodePoint(codePoint);
                }
            }
            return spaced.toString();
        }

        /**
         * Tells whether a code point lies in one of the CJK ideograph ranges checked below.
         *
         * @param i Unicode code point
         * @return {@code true} if {@code i} falls in any of the listed ranges
         */
        public static boolean isChineseChar(int i) {
            // CJK Unified Ideographs
            if (i >= 0x4E00 && i <= 0x9FFF) {
                return true;
            }
            // Extension A
            if (i >= 0x3400 && i <= 0x4DBF) {
                return true;
            }
            // Extension B
            if (i >= 0x20000 && i <= 0x2A6DF) {
                return true;
            }
            // Extension C
            if (i >= 0x2A700 && i <= 0x2B73F) {
                return true;
            }
            // Extension D
            if (i >= 0x2B740 && i <= 0x2B81F) {
                return true;
            }
            // Extension E
            if (i >= 0x2B820 && i <= 0x2CEAF) {
                return true;
            }
            // CJK Compatibility Ideographs
            if (i >= 0xF900 && i <= 0xFAFF) {
                return true;
            }
            // CJK Compatibility Ideographs Supplement
            return i >= 0x2F800 && i <= 0x2FA1F;
        }
    }

    /* loaded from: input_file:com/alibaba/alink/operator/common/nlp/bert/tokenizer/BertTokenizerImpl$WordpieceTokenizer.class */
    public static class WordpieceTokenizer {
        /** Vocabulary; {@code tokenizer} only checks whether a candidate piece is a key. */
        Map<String, Integer> vocab;
        /** Token emitted in place of a word that is too long or cannot be fully split. */
        String unkToken;
        /** Words with more code points than this are replaced by {@code unkToken}. */
        int maxInputCharsPerWord;

        /**
         * Creates a WordPiece tokenizer.
         *
         * @param map stored as {@code vocab} (kept by reference, not copied)
         * @param str stored as {@code unkToken}
         * @param i   stored as {@code maxInputCharsPerWord}
         */
        public WordpieceTokenizer(Map<String, Integer> map, String str, int i) {
            this.vocab = map;
            this.unkToken = str;
            this.maxInputCharsPerWord = i;
        }

        /**
         * Creates a WordPiece tokenizer with a per-word limit of 100 code points.
         *
         * @param map vocabulary
         * @param str unknown token
         */
        public WordpieceTokenizer(Map<String, Integer> map, String str) {
            this(map, str, 100);
        }

        /**
         * Splits whitespace-separated words into vocabulary pieces using greedy
         * longest-match-first search. Pieces that do not start a word carry a {@code "##"}
         * prefix. A word longer than {@code maxInputCharsPerWord} code points, or one that
         * cannot be covered completely by vocabulary entries, yields a single {@code unkToken}.
         *
         * @param str text whose words are to be split
         * @return the pieces of all words, in order
         */
        public String[] tokenizer(String str) {
            List<String> output = new ArrayList<>();
            for (String word : TokenizationUtils.whitespaceTokenize(str)) {
                int[] codePoints = word.codePoints().toArray();
                if (codePoints.length > this.maxInputCharsPerWord) {
                    output.add(this.unkToken);
                    continue;
                }

                List<String> pieces = new ArrayList<>();
                boolean covered = true;
                int start = 0;
                while (covered && start < codePoints.length) {
                    // Try the longest remaining span first and shrink it from the right.
                    String match = null;
                    int end = codePoints.length;
                    for (; end > start; end--) {
                        String candidate = new String(codePoints, start, end - start);
                        if (start > 0) {
                            candidate = "##" + candidate;
                        }
                        if (this.vocab.containsKey(candidate)) {
                            match = candidate;
                            break;
                        }
                    }
                    if (match == null) {
                        // Nothing in the vocabulary starts here: the whole word is unknown.
                        covered = false;
                    } else {
                        pieces.add(match);
                        start = end;
                    }
                }

                if (covered) {
                    output.addAll(pieces);
                } else {
                    output.add(this.unkToken);
                }
            }
            return output.toArray(new String[0]);
        }
    }

    /**
     * Loads a tokenizer for the given model name or path with no extra options; delegates
     * to {@code fromPretrained(str, Kwargs.empty())}.
     */
    public static BertTokenizerImpl fromPretrained(String str) {
        return fromPretrained(str, Kwargs.empty());
    }

    /**
     * Loads a tokenizer through {@code PreTrainedTokenizer.fromPretrained}.
     *
     * @param str    value stored under {@code "pretrained_model_name_or_path"}
     * @param kwargs extra options; the entry is added to {@code kwargs.clone()}, not to this argument
     * @return the loaded tokenizer
     */
    public static BertTokenizerImpl fromPretrained(String str, Kwargs kwargs) {
        Kwargs options = kwargs.clone();
        options.put("pretrained_model_name_or_path", str);
        return (BertTokenizerImpl) PreTrainedTokenizer.fromPretrained(BertTokenizerImpl.class, options);
    }

    /**
     * Builds the tokenizer from its configuration.
     *
     * <p>Keys read from {@code kwargs}: {@code vocab_file} (path of the vocabulary file),
     * {@code do_basic_tokenize}, {@code do_lower_case} and {@code tokenize_chinese_chars}
     * (each defaulting to {@code true} when absent), {@code never_split},
     * {@code strip_accents} and {@code unk_token} (passed on as-is, possibly {@code null}).
     *
     * @param kwargs tokenizer configuration; also stored in {@code config}
     */
    public BertTokenizerImpl(Kwargs kwargs) {
        this.config = kwargs;
        File vocabFile = new File((String) kwargs.get("vocab_file"));
        boolean basicTokenize = ((Boolean) kwargs.getOrDefault("do_basic_tokenize", true)).booleanValue();
        boolean lowerCase = ((Boolean) kwargs.getOrDefault("do_lower_case", true)).booleanValue();
        @SuppressWarnings("unchecked")
        Set<String> neverSplit = (Set<String>) kwargs.get("never_split");
        // Default like the other boolean switches instead of failing with a
        // NullPointerException on unboxing when the key is absent.
        boolean splitChinese = ((Boolean) kwargs.getOrDefault("tokenize_chinese_chars", true)).booleanValue();
        Boolean stripAccents = (Boolean) kwargs.get("strip_accents");
        String unkToken = (String) kwargs.get("unk_token");

        this.vocab = loadVocab(vocabFile);
        // Reverse index, kept in vocabulary order.
        this.idsToTokens = new LinkedHashMap<>();
        this.vocab.forEach((token, id) -> this.idsToTokens.put(id, token));

        this.doBasicTokenize = basicTokenize;
        if (basicTokenize) {
            this.basicTokenizer = new BasicTokenizer(lowerCase, neverSplit, splitChinese, stripAccents);
        }
        this.wordpieceTokenizer = new WordpieceTokenizer(this.vocab, unkToken);
    }

    /**
     * Reads a vocabulary file (UTF-8, one token per line) and maps every line to its
     * zero-based line index. If a line occurs more than once, the last index is kept.
     *
     * @param file vocabulary file
     * @return tokens mapped to ids, iterating in first-occurrence order
     * @throws AkUnclassifiedErrorException if the file cannot be read; the underlying
     *                                      {@link IOException} is attached as the cause
     */
    static LinkedHashMap<String, Integer> loadVocab(File file) {
        List<String> lines;
        try {
            lines = FileUtils.readLines(file, StandardCharsets.UTF_8);
        } catch (IOException e) {
            // Keep the I/O failure as the cause so the real reason is not lost.
            throw new AkUnclassifiedErrorException(
                String.format("Cannot read all lines in %s", file.getAbsoluteFile()), e);
        }
        LinkedHashMap<String, Integer> tokenToId = new LinkedHashMap<>();
        int index = 0;
        for (String token : lines) {
            tokenToId.put(token, index);
            index++;
        }
        return tokenToId;
    }

    /**
     * Looks up the id of a token, falling back to the id of the configured unknown token
     * when the token is not in the vocabulary.
     *
     * @param str token to look up
     * @return the token's id, or the unknown token's id
     */
    @Override
    protected int convertTokenToIdImpl(String str) {
        String unknownToken = this.specialTokenValues.get(PreTrainedTokenizer.SPECIAL_TOKENS.UNK_TOKEN);
        Integer unknownId = this.vocab.get(unknownToken);
        Integer id = this.vocab.getOrDefault(str, unknownId);
        return id.intValue();
    }

    /** Returns the number of entries in {@code vocab}. */
    @Override // com.alibaba.alink.operator.common.nlp.bert.tokenizer.PreTrainedTokenizer
    int vocabSize() {
        return this.vocab.size();
    }

    /**
     * Tokenizes text into WordPiece tokens. With basic tokenization enabled, the text is
     * first split by {@code basicTokenizer}; tokens contained in its {@code neverSplit} set
     * are kept whole and all others are split by {@code wordpieceTokenizer}. Otherwise the
     * raw text goes straight to {@code wordpieceTokenizer}.
     *
     * @param str text to tokenize
     * @return the tokens, in order
     */
    @Override
    protected String[] tokenizeImpl(String str) {
        if (!this.doBasicTokenize) {
            // Hand back a private copy of the WordPiece result.
            return this.wordpieceTokenizer.tokenizer(str).clone();
        }
        List<String> tokens = new ArrayList<>();
        for (String word : this.basicTokenizer.tokenize(str, Collections.emptySet())) {
            if (this.basicTokenizer.neverSplit.contains(word)) {
                tokens.add(word);
                continue;
            }
            Collections.addAll(tokens, this.wordpieceTokenizer.tokenizer(word));
        }
        return tokens.toArray(new String[0]);
    }

    /**
     * Adds special tokens around a single sequence; delegates to
     * {@code buildInputsWithSpecialTokens(iArr, null)}.
     */
    public int[] buildInputsWithSpecialTokens(int[] iArr) {
        return buildInputsWithSpecialTokens(iArr, null);
    }

    /**
     * Wraps one or two id sequences with the CLS and SEP token ids:
     * {@code CLS a SEP} for a single sequence, {@code CLS a SEP b SEP} for a pair.
     *
     * @param iArr  ids of the first sequence
     * @param iArr2 ids of the second sequence, or {@code null} for a single sequence
     * @return a new array holding the ids with the special tokens inserted
     */
    @Override
    public int[] buildInputsWithSpecialTokens(int[] iArr, int[] iArr2) {
        int clsId = convertTokenToIdImpl(this.specialTokenValues.get(PreTrainedTokenizer.SPECIAL_TOKENS.CLS_TOKEN));
        int sepId = convertTokenToIdImpl(this.specialTokenValues.get(PreTrainedTokenizer.SPECIAL_TOKENS.SEP_TOKEN));

        int firstLength = iArr.length;
        boolean pair = iArr2 != null;
        // CLS + first + SEP, plus second + SEP for a pair.
        int total = firstLength + 2 + (pair ? iArr2.length + 1 : 0);

        int[] ids = new int[total];
        ids[0] = clsId;
        System.arraycopy(iArr, 0, ids, 1, firstLength);
        ids[firstLength + 1] = sepId;
        if (pair) {
            System.arraycopy(iArr2, 0, ids, firstLength + 2, iArr2.length);
            ids[total - 1] = sepId;
        }
        return ids;
    }

    /**
     * Builds the token-type ids matching {@code buildInputsWithSpecialTokens}: value 0 for
     * the {@code iArr.length + 2} positions of the first segment and, for a pair, value 1
     * for the {@code iArr2.length + 1} positions of the second segment. The arrays are
     * produced by {@code TokenizerUtils} helpers (presumably runs of the given values in
     * the given counts; confirm against those helpers).
     *
     * @param iArr  ids of the first sequence
     * @param iArr2 ids of the second sequence, or {@code null} for a single sequence
     * @return the token-type id array
     */
    @Override
    public int[] createTokenTypeIdsFromSequences(int[] iArr, int[] iArr2) {
        if (iArr2 == null) {
            return TokenizerUtils.nCopiesArray(0, iArr.length + 2);
        }
        return TokenizerUtils.createArrayWithCopies(0, iArr.length + 2, 1, iArr2.length + 1);
    }

    @Override // com.alibaba.alink.operator.common.nlp.bert.tokenizer.PreTrainedTokenizer
    protected int[] getSpecialTokensMask(int[] iArr, int[] iArr2) {
        // Returns a mask with 1 at every special-token position and 0 elsewhere:
        // [1, 0.., 1] for a single sequence and [1, 0.., 1, 0.., 1] for a pair
        // (iArr2 == null selects the single-sequence layout).
        int firstSeparator = iArr.length + 1;
        int lastIndex = (iArr2 == null) ? firstSeparator : firstSeparator + iArr2.length + 1;

        // A new int[] is zero-filled, so only the special positions need to be set.
        int[] mask = new int[lastIndex + 1];
        mask[0] = 1;
        mask[firstSeparator] = 1;
        mask[lastIndex] = 1;
        return mask;
    }
}
