package com.alibaba.alink.operator.common.nlp;

import com.alibaba.alink.common.mapper.SISOMapper;
import com.alibaba.alink.params.nlp.RegexTokenizerParams;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.ml.api.misc.param.Params;
import org.apache.flink.table.api.TableSchema;

/* loaded from: input_file:com/alibaba/alink/operator/common/nlp/RegexTokenizerMapper.class */
/**
 * Single-input/single-output mapper that tokenizes a string with a regular expression and
 * returns the retained tokens joined by single spaces.
 *
 * <p>Behavior is driven by four parameters read from {@link RegexTokenizerParams}:
 * <ul>
 *   <li>{@code PATTERN} - the regular expression;</li>
 *   <li>{@code GAPS} - when {@code true} the pattern describes the separators between tokens
 *       (the input is split on it); when {@code false} the pattern describes the tokens
 *       themselves (every match is a token);</li>
 *   <li>{@code MIN_TOKEN_LENGTH} - tokens shorter than this are dropped;</li>
 *   <li>{@code TO_LOWER_CASE} - when {@code true} the input is lower-cased before tokenizing.</li>
 * </ul>
 */
public class RegexTokenizerMapper extends SISOMapper {
    private static final long serialVersionUID = -8361496868141967672L;

    /** Separator written between consecutive tokens of the output string. */
    private static final String TOKEN_DELIMITER = " ";

    /** Compiled form of {@link #patternStr}; built once so no row pays for regex compilation. */
    private final Pattern pattern;
    private final String patternStr;
    private final boolean toLowerCase;
    private final boolean gaps;
    private final int minTokenLength;

    /**
     * Reads the tokenizer settings from {@code params} and compiles the pattern.
     *
     * @param tableSchema schema handed to the {@link SISOMapper} constructor
     * @param params parameters handed to the {@link SISOMapper} constructor and queried for the
     *     {@link RegexTokenizerParams} settings
     * @throws java.util.regex.PatternSyntaxException if the configured pattern is not a valid
     *     regular expression (reported here for both modes instead of on the first mapped row)
     */
    public RegexTokenizerMapper(TableSchema tableSchema, Params params) {
        super(tableSchema, params);
        this.patternStr = (String) this.params.get(RegexTokenizerParams.PATTERN);
        this.minTokenLength = (Integer) this.params.get(RegexTokenizerParams.MIN_TOKEN_LENGTH);
        this.toLowerCase = (Boolean) this.params.get(RegexTokenizerParams.TO_LOWER_CASE);
        this.gaps = (Boolean) this.params.get(RegexTokenizerParams.GAPS);
        // Compile in both modes: splitting uses the same compiled pattern as matching, so the
        // regex is never re-parsed per row.
        this.pattern = Pattern.compile(this.patternStr);
    }

    /**
     * Declares the type of the output column.
     *
     * @return {@link Types#STRING}
     */
    @Override
    public TypeInformation initOutputColType() {
        return Types.STRING;
    }

    /**
     * Tokenizes one value.
     *
     * @param obj the input value; must be a {@link String} or {@code null}
     * @return {@code null} for {@code null} input, otherwise the tokens of at least
     *     {@code minTokenLength} characters joined by a single space (possibly the empty string)
     */
    @Override
    public Object mapColumn(Object obj) {
        if (obj == null) {
            return null;
        }
        String text = (String) obj;
        if (this.toLowerCase) {
            // Locale.ROOT keeps the result independent of the default locale of the JVM
            // that happens to execute this mapper.
            text = text.toLowerCase(Locale.ROOT);
        }

        StringBuilder joined = new StringBuilder();
        boolean first = true;
        if (this.gaps) {
            // Pattern#split(CharSequence) yields the same array as String#split(String).
            for (String token : this.pattern.split(text)) {
                first = appendToken(joined, token, first);
            }
        } else {
            Matcher matcher = this.pattern.matcher(text);
            while (matcher.find()) {
                first = appendToken(joined, matcher.group(), first);
            }
        }
        return joined.toString();
    }

    /**
     * Appends {@code token} to {@code out} if it is long enough, preceded by the delimiter unless
     * it is the first token written.
     *
     * <p>An explicit flag (rather than {@code out.length()}) tracks "first" so that an accepted
     * empty token still causes a delimiter before the next one.
     *
     * @param out buffer collecting the output
     * @param token candidate token
     * @param first whether no token has been written to {@code out} yet
     * @return the updated "first" flag: {@code false} once any token has been written
     */
    private boolean appendToken(StringBuilder out, String token, boolean first) {
        if (token.length() < this.minTokenLength) {
            return first;
        }
        if (!first) {
            out.append(TOKEN_DELIMITER);
        }
        out.append(token);
        return false;
    }
}
