package com.alibaba.alink.operator.common.dataproc;

import com.alibaba.alink.common.exceptions.AkUnsupportedOperationException;
import com.alibaba.alink.params.dataproc.HasStringOrderTypeDefaultAsRandom;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.flink.api.common.functions.GroupReduceFunction;
import org.apache.flink.api.common.functions.MapPartitionFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.functions.RichGroupReduceFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.functions.RichMapPartitionFunction;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.operators.SingleInputUdfOperator;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.util.Collector;

/* loaded from: input_file:com/alibaba/alink/operator/common/dataproc/HugeStringIndexerUtil.class */
public class HugeStringIndexerUtil {
    /**
     * Dispatches to the indexing routine that matches the requested ordering.
     *
     * @param dataSet         (integer key, token) pairs to index
     * @param stringOrderType ordering strategy; selects which routine is invoked
     * @param j               start index forwarded to the selected routine
     * @return (integer key, token, assigned index) triples produced by the selected routine
     * @throws AkUnsupportedOperationException if the order type is none of the handled constants
     */
    public static DataSet<Tuple3<Integer, String, Long>> indexTokens(DataSet<Tuple2<Integer, String>> dataSet, HasStringOrderTypeDefaultAsRandom.StringOrderType stringOrderType, long j) {
        switch (stringOrderType) {
            case FREQUENCY_ASC:
            case FREQUENCY_DESC:
                // Both frequency variants share one routine; only the direction flag differs.
                return indexSortedByFreq(dataSet, j, stringOrderType == HasStringOrderTypeDefaultAsRandom.StringOrderType.FREQUENCY_ASC);
            case ALPHABET_ASC:
            case ALPHABET_DESC:
                // Both alphabetical variants share one routine; only the direction flag differs.
                return indexSortedByAlphabet(dataSet, j, stringOrderType == HasStringOrderTypeDefaultAsRandom.StringOrderType.ALPHABET_ASC);
            case RANDOM:
                return indexRandom(dataSet, j);
            default:
                throw new AkUnsupportedOperationException("Unsupported order type " + stringOrderType);
        }
    }

    /**
     * Indexes the distinct tokens without imposing any particular order.
     *
     * <p>The input is de-duplicated with {@link #distinct(DataSet)} and the result is numbered by
     * {@link #zipWithIndex(DataSet, Long)}.
     *
     * @param dataSet (integer key, token) pairs, possibly containing duplicates
     * @param j       start index passed on to {@code zipWithIndex}
     * @return (integer key, token, assigned index) triples
     */
    public static DataSet<Tuple3<Integer, String, Long>> indexRandom(DataSet<Tuple2<Integer, String>> dataSet, long j) {
        DataSet<Tuple2<Integer, String>> uniqueTokens = distinct(dataSet);
        Long firstIndex = j;
        return zipWithIndex(uniqueTokens, firstIndex);
    }

    /**
     * Indexes the distinct tokens of each integer key in order of their occurrence count.
     *
     * <p>Steps:
     * <ol>
     *   <li>{@link #countStringTokens(DataSet)} yields (key, token, occurrence count).</li>
     *   <li>The (key, occurrence count) projection is counted again with
     *       {@link #countLongTokens(DataSet)} and broadcast as {@code "counts"}, i.e. how many
     *       tokens share each occurrence count.</li>
     *   <li>Every task turns that broadcast into per-key base offsets with
     *       {@link #sortCounts(List, boolean, long)}.</li>
     *   <li>Tokens are grouped by (key, occurrence count); each group is numbered consecutively
     *       starting at the base offset of its occurrence count.</li>
     * </ol>
     *
     * @param dataSet (integer key, token) pairs, possibly containing duplicates
     * @param j       start index; also the fallback base when an occurrence count has no offset
     * @param z       {@code true} to give lower occurrence counts the lower indices, {@code false}
     *                for the reverse
     * @return (integer key, token, assigned index) triples
     */
    public static DataSet<Tuple3<Integer, String, Long>> indexSortedByFreq(DataSet<Tuple2<Integer, String>> dataSet, final long j, final boolean z) {
        DataSet<Tuple3<Integer, String, Long>> tokenFrequencies = countStringTokens(dataSet);
        return tokenFrequencies.groupBy(new int[]{0, 2}).reduceGroup(new RichGroupReduceFunction<Tuple3<Integer, String, Long>, Tuple3<Integer, String, Long>>() {
            /** Per integer key: occurrence count -> first index reserved for tokens with that count. */
            HashMap<Long, Long>[] mapList = null;

            @Override
            public void open(Configuration configuration) throws Exception {
                this.mapList = HugeStringIndexerUtil.sortCounts(getRuntimeContext().getBroadcastVariable("counts"), z, j);
            }

            @Override
            public void reduce(Iterable<Tuple3<Integer, String, Long>> iterable, Collector<Tuple3<Integer, String, Long>> collector) throws Exception {
                // An explicit flag marks whether the base offset has been looked up. A numeric
                // sentinel such as -1 would collide with legitimate negative start indices and
                // cause the base to be re-read for every element of the group.
                boolean baseResolved = false;
                long nextIndex = 0L;
                for (Tuple3<Integer, String, Long> token : iterable) {
                    if (!baseResolved) {
                        nextIndex = this.mapList[token.f0].getOrDefault(token.f2, j);
                        baseResolved = true;
                    }
                    collector.collect(Tuple3.of(token.f0, token.f1, nextIndex));
                    nextIndex++;
                }
            }
        }).withBroadcastSet(countLongTokens(tokenFrequencies.project(new int[]{0, 2})), "counts").name("assign_index_sort_by_frequency").returns(new TupleTypeInfo(new TypeInformation[]{Types.INT, Types.STRING, Types.LONG}));
    }

    /**
     * Converts (integer key, value, weight) triples into per-key maps from value to base offset.
     *
     * <p>For every integer key the (value, weight) pairs are sorted by value. Walking the sorted
     * pairs, each value is mapped to a running offset that starts at {@code j} and grows by the
     * weight of every pair already visited.
     *
     * <p>The returned array is indexed by the integer key and holds one (possibly empty) map for
     * every position from 0 up to the largest key present. Sizing by the largest key rather than
     * by the number of distinct keys keeps the lookup valid when some key in the range has no
     * entries at all.
     *
     * @param list (integer key, value, weight) triples; keys are expected to be non-negative
     * @param z    {@code true} to sort values ascending, {@code false} for descending
     * @param j    offset assigned to the first value of every key
     * @return array indexed by integer key; element {@code k} maps each value of key {@code k} to
     *         its base offset
     */
    public static HashMap<Long, Long>[] sortCounts(List<Tuple3<Integer, Long, Long>> list, final boolean z, long j) {
        // Bucket the (value, weight) pairs by key and remember the largest key seen.
        Map<Integer, List<Tuple2<Long, Long>>> pairsByKey = new HashMap<>();
        int largestKey = -1;
        for (Tuple3<Integer, Long, Long> triple : list) {
            pairsByKey.computeIfAbsent(triple.f0, unused -> new ArrayList<>()).add(Tuple2.of(triple.f1, triple.f2));
            largestKey = Math.max(largestKey, triple.f0);
        }

        // Generic array creation is unavoidable here because the return type is an array of maps.
        @SuppressWarnings("unchecked")
        HashMap<Long, Long>[] offsetsByKey = new HashMap[largestKey + 1];
        for (int key = 0; key < offsetsByKey.length; key++) {
            offsetsByKey[key] = new HashMap<>();
        }

        Comparator<Tuple2<Long, Long>> ascending = Comparator.comparingLong(pair -> pair.f0);
        Comparator<Tuple2<Long, Long>> order = z ? ascending : ascending.reversed();

        for (Map.Entry<Integer, List<Tuple2<Long, Long>>> group : pairsByKey.entrySet()) {
            List<Tuple2<Long, Long>> pairs = group.getValue();
            pairs.sort(order);
            HashMap<Long, Long> offsets = offsetsByKey[group.getKey()];
            long nextOffset = j;
            for (Tuple2<Long, Long> pair : pairs) {
                offsets.put(pair.f0, nextOffset);
                nextOffset += pair.f1;
            }
        }
        return offsetsByKey;
    }

    /**
     * Removes duplicate (integer key, token) pairs.
     *
     * <p>Records are grouped on both fields and each group is reduced to a single record by
     * always keeping the left operand of the reduce step.
     *
     * @param dataSet (integer key, token) pairs, possibly containing duplicates
     * @return one record per distinct (integer key, token) combination
     */
    public static DataSet<Tuple2<Integer, String>> distinct(DataSet<Tuple2<Integer, String>> dataSet) {
        ReduceFunction<Tuple2<Integer, String>> keepLeft = new ReduceFunction<Tuple2<Integer, String>>() {
            private static final long serialVersionUID = 3246078624056103227L;

            @Override
            public Tuple2<Integer, String> reduce(Tuple2<Integer, String> kept, Tuple2<Integer, String> dropped) throws Exception {
                // Both operands carry the same grouping fields, so either one represents the group.
                return kept;
            }
        };
        TupleTypeInfo<Tuple2<Integer, String>> resultType = new TupleTypeInfo<>(Types.INT, Types.STRING);
        return dataSet.groupBy(new int[]{0, 1}).reduce(keepLeft).name("distinct_tokens").returns(resultType);
    }

    /**
     * Counts how often every (integer key, token) combination occurs.
     *
     * @param dataSet         (integer key, token) pairs
     * @param typeInformation Flink type information used for the token field of the result
     * @param <T>             token type
     * @return (integer key, token, number of occurrences), one record per distinct combination
     */
    public static <T> DataSet<Tuple3<Integer, T, Long>> countTokens(DataSet<Tuple2<Integer, T>> dataSet, TypeInformation typeInformation) {
        GroupReduceFunction<Tuple2<Integer, T>, Tuple3<Integer, T, Long>> counter = new GroupReduceFunction<Tuple2<Integer, T>, Tuple3<Integer, T, Long>>() {
            @Override
            public void reduce(Iterable<Tuple2<Integer, T>> iterable, Collector<Tuple3<Integer, T, Long>> collector) throws Exception {
                int key = -1;
                T token = null;
                long occurrences = 0L;
                for (Tuple2<Integer, T> record : iterable) {
                    // The grouping fields are captured from the first record only.
                    if (occurrences == 0L) {
                        key = record.f0;
                        token = record.f1;
                    }
                    occurrences++;
                }
                if (occurrences == 0L) {
                    return;
                }
                collector.collect(Tuple3.of(key, token, occurrences));
            }
        };
        return dataSet.groupBy(new int[]{0, 1}).reduceGroup(counter).name("count_tokens").returns(new TupleTypeInfo(new TypeInformation[]{Types.INT, typeInformation, Types.LONG}));
    }

    /**
     * Counts how often every (integer key, string token) combination occurs.
     *
     * @param dataSet (integer key, token) pairs
     * @return (integer key, token, number of occurrences), one record per distinct combination
     */
    public static DataSet<Tuple3<Integer, String, Long>> countStringTokens(DataSet<Tuple2<Integer, String>> dataSet) {
        return dataSet
            .groupBy(new int[]{0, 1})
            .reduceGroup(new GroupReduceFunction<Tuple2<Integer, String>, Tuple3<Integer, String, Long>>() {
                @Override
                public void reduce(Iterable<Tuple2<Integer, String>> iterable, Collector<Tuple3<Integer, String, Long>> collector) throws Exception {
                    long total = 0L;
                    int key = -1;
                    String token = null;
                    for (Tuple2<Integer, String> record : iterable) {
                        // Remember the grouping fields of the first record; later ones only add to the tally.
                        if (total == 0L) {
                            key = record.f0;
                            token = record.f1;
                        }
                        total += 1L;
                    }
                    if (total != 0L) {
                        collector.collect(Tuple3.of(key, token, total));
                    }
                }
            })
            .name("count_tokens")
            .returns(new TupleTypeInfo<>(Types.INT, Types.STRING, Types.LONG));
    }

    /**
     * Counts how often every (integer key, long value) combination occurs.
     *
     * @param dataSet (integer key, value) pairs
     * @return (integer key, value, number of occurrences), one record per distinct combination
     */
    public static DataSet<Tuple3<Integer, Long, Long>> countLongTokens(DataSet<Tuple2<Integer, Long>> dataSet) {
        TupleTypeInfo<Tuple3<Integer, Long, Long>> resultType = new TupleTypeInfo<>(Types.INT, Types.LONG, Types.LONG);
        return dataSet.groupBy(new int[]{0, 1}).reduceGroup(new GroupReduceFunction<Tuple2<Integer, Long>, Tuple3<Integer, Long, Long>>() {
            @Override
            public void reduce(Iterable<Tuple2<Integer, Long>> iterable, Collector<Tuple3<Integer, Long, Long>> collector) throws Exception {
                boolean empty = true;
                int key = -1;
                Long value = null;
                long size = 0L;
                for (Tuple2<Integer, Long> record : iterable) {
                    if (empty) {
                        // First record of the group supplies the fields echoed in the output.
                        key = record.f0;
                        value = record.f1;
                        empty = false;
                    }
                    size++;
                }
                if (!empty) {
                    collector.collect(Tuple3.of(key, value, size));
                }
            }
        }).name("count_tokens").returns(resultType);
    }

    /**
     * Indexes the distinct tokens of each integer key in case-insensitive alphabetical order.
     *
     * <p>The job is built in two stages:
     * <ol>
     *   <li>Every distinct (key, token) record is extended with a short prefix of the token. The
     *       prefix length per key is {@code min(maxTokenLength / 4, 4)}, where the maximum token
     *       length per key is computed from {@code dataSet} and broadcast as {@code "strMaxLen"}.</li>
     *   <li>Records are grouped by (key, prefix). The number of records per (key, prefix) is
     *       broadcast as {@code "counts"}; each task sorts the prefixes of a key and gives every
     *       prefix a base index equal to {@code j} plus the sizes of all prefixes sorted before it.
     *       Inside a group the tokens are sorted and numbered consecutively from that base.</li>
     * </ol>
     *
     * @param dataSet (integer key, token) pairs, possibly containing duplicates
     * @param j       start index; also the fallback base when a prefix has no computed offset
     * @param z       {@code true} for ascending order, {@code false} for descending
     * @return (integer key, token, assigned index) triples
     */
    public static DataSet<Tuple3<Integer, String, Long>> indexSortedByAlphabet(DataSet<Tuple2<Integer, String>> dataSet, final long j, final boolean z) {
        // Stage 1: (key, token) -> (key, token, prefix).
        SingleInputUdfOperator returns = distinct(dataSet).map(new RichMapFunction<Tuple2<Integer, String>, Tuple3<Integer, String, String>>() { // from class: com.alibaba.alink.operator.common.dataproc.HugeStringIndexerUtil.9
            // Integer key -> prefix length used for tokens of that key.
            HashMap<Integer, Integer> map = new HashMap<>();

            /* JADX WARN: Multi-variable type inference failed */
            public void open(Configuration configuration) throws Exception {
                super.open(configuration);
                // "strMaxLen" carries (key, longest token length); prefix length is a quarter of it
                // (integer division), capped at 4.
                for (Tuple2 tuple2 : getRuntimeContext().getBroadcastVariable("strMaxLen")) {
                    this.map.put(tuple2.f0, Integer.valueOf(Math.min(((Integer) tuple2.f1).intValue() / 4, 4)));
                }
            }

            public Tuple3<Integer, String, String> map(Tuple2<Integer, String> tuple2) throws Exception {
                // Tokens shorter than the prefix length are used whole as their own prefix.
                return Tuple3.of(tuple2.f0, tuple2.f1, ((String) tuple2.f1).substring(0, Math.min(((String) tuple2.f1).length(), this.map.get(tuple2.f0).intValue())));
            }
        }).withBroadcastSet(dataSet.mapPartition(new MapPartitionFunction<Tuple2<Integer, String>, Tuple2<Integer, Integer>>() { // from class: com.alibaba.alink.operator.common.dataproc.HugeStringIndexerUtil.8
            public void mapPartition(Iterable<Tuple2<Integer, String>> iterable, Collector<Tuple2<Integer, Integer>> collector) throws Exception {
                // Partition-local maximum token length per key.
                HashMap hashMap = new HashMap();
                for (Tuple2<Integer, String> tuple2 : iterable) {
                    hashMap.put(tuple2.f0, Integer.valueOf(Math.max(((Integer) hashMap.getOrDefault(tuple2.f0, 0)).intValue(), ((String) tuple2.f1).length())));
                }
                hashMap.forEach((num, num2) -> {
                    collector.collect(Tuple2.of(num, num2));
                });
            }
        }).groupBy(new int[]{0}).reduce(new ReduceFunction<Tuple2<Integer, Integer>>() { // from class: com.alibaba.alink.operator.common.dataproc.HugeStringIndexerUtil.7
            public Tuple2<Integer, Integer> reduce(Tuple2<Integer, Integer> tuple2, Tuple2<Integer, Integer> tuple22) throws Exception {
                // Merge the partition-local maxima into one maximum per key.
                return Tuple2.of(tuple2.f0, Integer.valueOf(Math.max(((Integer) tuple2.f1).intValue(), ((Integer) tuple22.f1).intValue())));
            }
        }).name("compute_max_feature_length").returns(new TupleTypeInfo(new TypeInformation[]{Types.INT, Types.INT})), "strMaxLen").name("generate_feature_prefix").returns(new TupleTypeInfo(new TypeInformation[]{Types.INT, Types.STRING, Types.STRING}));
        // Stage 2: group by (key, prefix) and number the tokens of every group.
        return returns.groupBy(new int[]{0, 2}).reduceGroup(new RichGroupReduceFunction<Tuple3<Integer, String, String>, Tuple3<Integer, String, Long>>() { // from class: com.alibaba.alink.operator.common.dataproc.HugeStringIndexerUtil.10
            // Indexed by integer key: prefix -> first index reserved for tokens with that prefix.
            HashMap<String, Long>[] mapList = null;

            public void open(Configuration configuration) throws Exception {
                // "counts" carries (key, prefix, number of records with that prefix).
                List<Tuple3> broadcastVariable = getRuntimeContext().getBroadcastVariable("counts");
                HashMap hashMap = new HashMap();
                for (Tuple3 tuple3 : broadcastVariable) {
                    ArrayList arrayList = (ArrayList) hashMap.getOrDefault(tuple3.f0, new ArrayList());
                    arrayList.add(Tuple2.of(tuple3.f1, tuple3.f2));
                    hashMap.put(tuple3.f0, arrayList);
                }
                // NOTE(review): the array length is the number of distinct keys, yet it is indexed by
                // key value below and in reduce(); this presumes keys are exactly 0..n-1 - confirm.
                this.mapList = new HashMap[hashMap.size()];
                for (int i = 0; i < this.mapList.length; i++) {
                    this.mapList[i] = new HashMap<>();
                }
                for (Map.Entry entry : hashMap.entrySet()) {
                    ArrayList arrayList2 = (ArrayList) entry.getValue();
                    // NOTE(review): prefixes that differ only in letter case compare as equal here
                    // although they form separate groups; their relative order then follows the
                    // order in which they were added to the list.
                    arrayList2.sort(new Comparator<Tuple2<String, Long>>() { // from class: com.alibaba.alink.operator.common.dataproc.HugeStringIndexerUtil.10.1
                        @Override // java.util.Comparator
                        public int compare(Tuple2<String, Long> tuple2, Tuple2<String, Long> tuple22) {
                            return z ? String.CASE_INSENSITIVE_ORDER.compare(tuple2.f0, tuple22.f0) : String.CASE_INSENSITIVE_ORDER.compare(tuple22.f0, tuple2.f0);
                        }
                    });
                    // Running offset: each prefix starts where the previous prefix's block ends.
                    long j2 = j;
                    for (int i2 = 0; i2 < arrayList2.size(); i2++) {
                        this.mapList[((Integer) entry.getKey()).intValue()].put(((Tuple2) arrayList2.get(i2)).f0, Long.valueOf(j2));
                        j2 += ((Long) ((Tuple2) arrayList2.get(i2)).f1).longValue();
                    }
                }
            }

            public void reduce(Iterable<Tuple3<Integer, String, String>> iterable, Collector<Tuple3<Integer, String, Long>> collector) throws Exception {
                // Buffer all tokens of this (key, prefix) group so they can be sorted locally.
                ArrayList arrayList = new ArrayList();
                int i = 0;
                String str = "";
                for (Tuple3<Integer, String, String> tuple3 : iterable) {
                    i = ((Integer) tuple3.f0).intValue();
                    arrayList.add(tuple3.f1);
                    str = (String) tuple3.f2;
                }
                arrayList.sort(new Comparator<String>() { // from class: com.alibaba.alink.operator.common.dataproc.HugeStringIndexerUtil.10.2
                    @Override // java.util.Comparator
                    public int compare(String str2, String str3) {
                        int compare = String.CASE_INSENSITIVE_ORDER.compare(str2, str3);
                        return z ? compare : 0 - compare;
                    }
                });
                // Base index of this prefix; falls back to the start index when no offset is known.
                Long orDefault = this.mapList[i].getOrDefault(str, Long.valueOf(j));
                for (int i2 = 0; i2 < arrayList.size(); i2++) {
                    collector.collect(Tuple3.of(Integer.valueOf(i), arrayList.get(i2), Long.valueOf(orDefault.longValue() + i2)));
                }
            }
        }).withBroadcastSet(countStringTokens(returns.project(new int[]{0, 2})), "counts").name("assign_index_alphabet_sort").returns(new TupleTypeInfo(new TypeInformation[]{Types.INT, Types.STRING, Types.LONG}));
    }

    /**
     * Counts, for every parallel subtask, how many records it holds per integer key.
     *
     * @param dataSet (integer key, token) pairs
     * @return (subtask index, integer key, number of records of that key seen by the subtask)
     */
    public static DataSet<Tuple3<Integer, Integer, Long>> countElementsPerPartition(DataSet<Tuple2<Integer, String>> dataSet) {
        return dataSet.mapPartition(new RichMapPartitionFunction<Tuple2<Integer, String>, Tuple3<Integer, Integer, Long>>() {
            @Override
            public void mapPartition(Iterable<Tuple2<Integer, String>> iterable, Collector<Tuple3<Integer, Integer, Long>> collector) throws Exception {
                // Tally the records of this partition by their integer key.
                HashMap<Integer, Long> tally = new HashMap<>();
                for (Tuple2<Integer, String> record : iterable) {
                    tally.merge(record.f0, 1L, Long::sum);
                }
                // Tag every tally with the subtask that produced it.
                final int subtask = getRuntimeContext().getIndexOfThisSubtask();
                for (Map.Entry<Integer, Long> keyCount : tally.entrySet()) {
                    collector.collect(Tuple3.of(subtask, keyCount.getKey(), keyCount.getValue()));
                }
            }
        }).name("count_elements_per_partition").returns(new TupleTypeInfo<>(Types.INT, Types.INT, Types.LONG));
    }

    /**
     * Numbers the records of every integer key consecutively across all parallel subtasks.
     *
     * <p>The per-subtask record counts from {@link #countElementsPerPartition(DataSet)} are
     * broadcast as {@code "counts"}. Each subtask starts numbering a key at {@code l} plus the
     * number of records of that key held by subtasks with a smaller index, then increments by one
     * per record.
     *
     * @param dataSet (integer key, token) pairs
     * @param l       index given to the first record of every key
     * @return (integer key, token, assigned index) triples
     */
    public static DataSet<Tuple3<Integer, String, Long>> zipWithIndex(DataSet<Tuple2<Integer, String>> dataSet, final Long l) {
        return dataSet.mapPartition(new RichMapPartitionFunction<Tuple2<Integer, String>, Tuple3<Integer, String, Long>>() {
            /** Integer key -> index that the next record of that key will receive in this subtask. */
            HashMap<Integer, Long> nextIndexByKey = new HashMap<>();

            @Override
            public void open(Configuration configuration) throws Exception {
                List<Tuple3<Integer, Integer, Long>> partitionCounts = getRuntimeContext().getBroadcastVariable("counts");
                int self = getRuntimeContext().getIndexOfThisSubtask();
                for (Tuple3<Integer, Integer, Long> stat : partitionCounts) {
                    // Only records held by lower-numbered subtasks shift this subtask's start.
                    if (stat.f0 >= self) {
                        continue;
                    }
                    long before = this.nextIndexByKey.getOrDefault(stat.f1, l);
                    this.nextIndexByKey.put(stat.f1, before + stat.f2);
                }
            }

            @Override
            public void mapPartition(Iterable<Tuple2<Integer, String>> iterable, Collector<Tuple3<Integer, String, Long>> collector) throws Exception {
                for (Tuple2<Integer, String> record : iterable) {
                    long assigned = this.nextIndexByKey.getOrDefault(record.f0, l);
                    collector.collect(Tuple3.of(record.f0, record.f1, assigned));
                    this.nextIndexByKey.put(record.f0, assigned + 1);
                }
            }
        }).withBroadcastSet(countElementsPerPartition(dataSet), "counts").name("assign_index").returns(new TupleTypeInfo<>(Types.INT, Types.STRING, Types.LONG));
    }
}
