package com.alibaba.alink.operator.common.similarity.dataConverter;

import com.alibaba.alink.common.utils.JsonConverter;
import com.alibaba.alink.operator.batch.BatchOperator;
import com.alibaba.alink.operator.common.distance.SimHashHammingDistance;
import com.alibaba.alink.operator.common.nlp.WordCountUtil;
import com.alibaba.alink.operator.common.similarity.Sample;
import com.alibaba.alink.operator.common.similarity.modeldata.SimHashModelData;
import com.alibaba.alink.operator.common.similarity.similarity.SimHashHammingSimilarity;
import com.alibaba.alink.params.similarity.StringTextApproxNearestNeighborTrainParams;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.GroupReduceFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.RichMapPartitionFunction;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.operators.MapOperator;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.ml.api.misc.param.Params;
import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.core.type.TypeReference;
import org.apache.flink.table.api.TableSchema;
import org.apache.flink.types.Row;
import org.apache.flink.util.Collector;
import scala.util.hashing.MurmurHash3;

/* loaded from: input_file:com/alibaba/alink/operator/common/similarity/dataConverter/SimHashModelDataConverter.class */
/**
 * Converts between model rows and {@link SimHashModelData}.
 *
 * <p>Every model row has two string fields ({@code BUCKETS}, {@code HASHVALUE}); a data row fills
 * exactly one of them:
 * <ul>
 *   <li>{@code BUCKETS}: JSON of a {@code Tuple2<Integer, List<Object>>} - a bucket key and the ids
 *       whose hash value produced that key (see {@link #splitBigInteger(BigInteger)});</li>
 *   <li>{@code HASHVALUE}: JSON of a {@code Tuple2<Object, BigInteger>} - an id and its hash value.</li>
 * </ul>
 */
public class SimHashModelDataConverter extends NearestNeighborDataConverter<SimHashModelData> {
    private static final long serialVersionUID = -7071296061894969237L;

    /** Number of fields in every model data row. */
    private static final int ROW_SIZE = 2;
    /** Position of the bucket JSON inside a model data row. */
    private static final int BUCKETS_INDEX = 0;
    /** Position of the hash value JSON inside a model data row. */
    private static final int HASHVALUE_INDEX = 1;
    /** Maximum number of ids serialized into a single bucket row. */
    private static final int MAX_ID_NUMBER = WordCountUtil.BOUND_SIZE;
    /** Number of segments a hash value is cut into by {@link #splitBigInteger(BigInteger)}. */
    private static final int SEGMENT_COUNT = 4;
    /** Width in bits of one segment. */
    private static final int SEGMENT_BITS = 16;
    private static final MurmurHash3 HASH = new MurmurHash3();

    public SimHashModelDataConverter() {
        this.rowSize = ROW_SIZE;
    }

    /**
     * Returns the schema of the model data rows: two string columns named {@code BUCKETS} and
     * {@code HASHVALUE}.
     */
    @Override
    public TableSchema getModelDataSchema() {
        return new TableSchema(
            new String[]{"BUCKETS", "HASHVALUE"},
            new TypeInformation[]{Types.STRING, Types.STRING});
    }

    /**
     * Rebuilds the in-memory model from model data rows.
     *
     * <p>Rows with a non-null {@code BUCKETS} field are merged per bucket key (one bucket may be
     * spread over several rows); otherwise a non-null {@code HASHVALUE} field contributes one
     * id-to-hash entry. Rows with both fields null are ignored.
     *
     * @param list model data rows
     * @return model data built from the rows plus the metric and text flag read from the meta
     */
    @Override
    public SimHashModelData loadModelData(List<Row> list) {
        HashMap<Integer, List<Object>> buckets = new HashMap<>();
        HashMap<Object, BigInteger> hashValues = new HashMap<>();
        // The type tokens do not depend on the row, so create them once instead of per row.
        TypeReference<Tuple2<Integer, List<Object>>> bucketType =
            new TypeReference<Tuple2<Integer, List<Object>>>() {
            };
        TypeReference<Tuple2<Object, BigInteger>> hashValueType =
            new TypeReference<Tuple2<Object, BigInteger>>() {
            };

        for (Row row : list) {
            if (row.getField(BUCKETS_INDEX) != null) {
                @SuppressWarnings("unchecked")
                Tuple2<Integer, List<Object>> bucket = (Tuple2<Integer, List<Object>>) JsonConverter.fromJson(
                    (String) row.getField(BUCKETS_INDEX), bucketType.getType());
                List<Object> known = buckets.get(bucket.f0);
                if (known != null) {
                    bucket.f1.addAll(known);
                }
                buckets.put(bucket.f0, bucket.f1);
            } else if (row.getField(HASHVALUE_INDEX) != null) {
                @SuppressWarnings("unchecked")
                Tuple2<Object, BigInteger> idAndHash = (Tuple2<Object, BigInteger>) JsonConverter.fromJson(
                    (String) row.getField(HASHVALUE_INDEX), hashValueType.getType());
                hashValues.put(idAndHash.f0, idAndHash.f1);
            }
        }

        StringTextApproxNearestNeighborTrainParams.Metric metric =
            (StringTextApproxNearestNeighborTrainParams.Metric) this.meta.get(
                StringTextApproxNearestNeighborTrainParams.METRIC);
        boolean isText = ((Boolean) this.meta.get(StringModelDataConverter.TEXT)).booleanValue();
        return new SimHashModelData(buckets, hashValues, metric, isText);
    }

    /**
     * Builds the model rows for the given input.
     *
     * <p>Each input row is read as (id at field 0, string content at field 1). The content is hashed
     * with {@link SimHashHammingDistance#simHash}; when the {@code TEXT} parameter is true the
     * content is first split with {@link Sample#split}. The result is the union of one
     * {@code HASHVALUE} row per input row and the {@code BUCKETS} rows obtained by grouping ids on
     * the keys returned by {@link #splitBigInteger(BigInteger)}, passed through the converter's
     * {@code save2}.
     *
     * @param batchOperator operator providing the input rows
     * @param params        training parameters; forwarded as meta by subtask 0 only
     * @return data set of model rows
     */
    @Override
    public DataSet<Row> buildIndex(BatchOperator batchOperator, final Params params) {
        DataSet<Row> input = batchOperator.getDataSet();
        final SimHashHammingSimilarity similarity = new SimHashHammingSimilarity();
        final boolean isText = ((Boolean) params.get(StringModelDataConverter.TEXT)).booleanValue();

        MapOperator<Row, Tuple2<Object, BigInteger>> hashed = input.map(
            new MapFunction<Row, Tuple2<Object, BigInteger>>() {
                private static final long serialVersionUID = 5013768299997778182L;

                @Override
                public Tuple2<Object, BigInteger> map(Row row) throws Exception {
                    String content = (String) row.getField(1);
                    Object id = row.getField(0);
                    SimHashHammingDistance distance = (SimHashHammingDistance) similarity.getDistance();
                    BigInteger hash = isText
                        ? distance.simHash(Sample.split(content))
                        : distance.simHash(content);
                    return Tuple2.of(id, hash);
                }
            });

        DataSet<Row> hashValueRows = hashed.map(new MapFunction<Tuple2<Object, BigInteger>, Row>() {
            private static final long serialVersionUID = 8369275450085183950L;

            @Override
            public Row map(Tuple2<Object, BigInteger> idAndHash) throws Exception {
                Row row = new Row(ROW_SIZE);
                row.setField(HASHVALUE_INDEX, JsonConverter.toJson(idAndHash));
                return row;
            }
        });

        DataSet<Row> bucketRows = hashed
            .flatMap(new FlatMapFunction<Tuple2<Object, BigInteger>, Tuple2<Object, Integer>>() {
                private static final long serialVersionUID = -8190719852493750248L;

                @Override
                public void flatMap(Tuple2<Object, BigInteger> idAndHash,
                                    Collector<Tuple2<Object, Integer>> out) throws Exception {
                    // One (id, bucket key) pair per segment of the hash value.
                    for (int bucketKey : splitBigInteger(idAndHash.f1)) {
                        out.collect(Tuple2.of(idAndHash.f0, Integer.valueOf(bucketKey)));
                    }
                }
            })
            .groupBy(1)
            .reduceGroup(new GroupReduceFunction<Tuple2<Object, Integer>, Row>() {
                private static final long serialVersionUID = -4943198776190780076L;

                @Override
                public void reduce(Iterable<Tuple2<Object, Integer>> values, Collector<Row> out)
                    throws Exception {
                    List<Object> ids = new ArrayList<>(MAX_ID_NUMBER);
                    Integer bucketKey = null;
                    boolean emitted = false;
                    for (Tuple2<Object, Integer> value : values) {
                        if (bucketKey == null) {
                            bucketKey = value.f1;
                        }
                        ids.add(value.f0);
                        // Flush at exactly MAX_ID_NUMBER ids so a chunk never outgrows the
                        // pre-sized list; loadModelData merges chunks sharing a bucket key.
                        if (ids.size() >= MAX_ID_NUMBER) {
                            out.collect(bucketRow(bucketKey, ids));
                            ids.clear();
                            emitted = true;
                        }
                    }
                    // Skip the trailing row only when it would be an empty duplicate of a bucket
                    // that has already been emitted.
                    if (!ids.isEmpty() || !emitted) {
                        out.collect(bucketRow(bucketKey, ids));
                    }
                }
            });

        return hashValueRows
            .union(bucketRows)
            .mapPartition(new RichMapPartitionFunction<Row, Row>() {
                private static final long serialVersionUID = 4816437180146626035L;

                @Override
                public void mapPartition(Iterable<Row> rows, Collector<Row> out) throws Exception {
                    // Only subtask 0 hands the params over as meta; every other subtask passes
                    // null (presumably so the meta is written once - confirm against save2).
                    Params meta = getRuntimeContext().getIndexOfThisSubtask() == 0 ? params : null;
                    new SimHashModelDataConverter().save2(Tuple2.of(meta, rows), out);
                }
            })
            .name("build_model");
    }

    /**
     * Creates a model row whose {@code BUCKETS} field holds the JSON of (bucket key, ids).
     * The ids are serialized immediately, so the caller may reuse the list afterwards.
     */
    private static Row bucketRow(Integer bucketKey, List<Object> ids) {
        Row row = new Row(ROW_SIZE);
        row.setField(BUCKETS_INDEX, JsonConverter.toJson(Tuple2.of(bucketKey, ids)));
        return row;
    }

    /**
     * Derives four bucket keys from a hash value.
     *
     * <p>The value is cut into four segments, most significant first, using shifts of 48, 32, 16
     * and 0 bits; after each step the extracted part is subtracted from the remainder. Segments
     * 1 to 3 are therefore 16 bits wide, while segment 0 holds everything from bit 48 upwards
     * (narrowed by {@link BigInteger#intValue()}). Each key is the MurmurHash3 array hash, seed 0,
     * of the pair (segment index, segment value), so equal segment values at different positions
     * yield different keys.
     *
     * @param bigInteger hash value to split
     * @return array of four bucket keys, ordered by segment index
     */
    public static int[] splitBigInteger(BigInteger bigInteger) {
        int[] keys = new int[SEGMENT_COUNT];
        BigInteger remaining = bigInteger;
        for (int segment = 0; segment < SEGMENT_COUNT; segment++) {
            int shift = SEGMENT_BITS * (SEGMENT_COUNT - 1 - segment);
            BigInteger high = remaining.shiftRight(shift);
            remaining = remaining.subtract(high.shiftLeft(shift));
            keys[segment] = HASH.arrayHash(
                new Integer[]{Integer.valueOf(segment), Integer.valueOf(high.intValue())}, 0);
        }
        return keys;
    }
}
