/*
 * Decompiled with CFR 0.152.
 */
package org.elasticsearch.xpack.ml.inference.nlp.tokenizers;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.DelimitedToken;

/**
 * Converts a single word into a list of vocabulary ids by greedy
 * longest-match-first lookup against a fixed vocabulary.
 *
 * <p>The first piece of a word is looked up as-is; every later piece is
 * looked up with the {@code "##"} continuation prefix prepended.
 */
public class WordPieceTokenizer {
    /** Prefix prepended to any piece that does not start at index 0 of the word. */
    private static final String CONTINUATION = "##";
    private final Map<String, Integer> vocab;
    private final String unknownToken;
    private final int maxInputCharsPerWord;

    /**
     * Creates a tokenizer over the given vocabulary.
     *
     * @param vocab                mapping from piece text to id; the reference is stored as-is, not copied
     * @param unknownToken         vocabulary key whose id is returned for words that cannot be split
     * @param maxInputCharsPerWord words whose {@code String.length()} exceeds this value are mapped
     *                             directly to the unknown token without any lookup
     */
    public WordPieceTokenizer(Map<String, Integer> vocab, String unknownToken, int maxInputCharsPerWord) {
        this.vocab = vocab;
        this.unknownToken = unknownToken;
        this.maxInputCharsPerWord = maxInputCharsPerWord;
    }

    /**
     * Splits the text of {@code token} into vocabulary pieces and returns their ids.
     *
     * <p>Starting at the beginning of the word, the longest substring present in the
     * vocabulary is taken, then the search restarts from the end of that piece. If at
     * any position no substring (down to a single char) is in the vocabulary, the
     * pieces found so far are discarded and the whole word maps to the unknown token.
     *
     * @param token the word to tokenize; its text is read via {@code getToken()}
     * @return the ids of the matched pieces in order (a mutable list, empty for an empty word),
     *         or an immutable single-element list holding the unknown token's id when the word
     *         is too long or cannot be fully covered; that element is {@code null} if the
     *         unknown token itself is absent from the vocabulary
     */
    public List<Integer> tokenize(DelimitedToken token) {
        // Read the text once; the original re-fetched it for every candidate substring.
        final String word = token.getToken();
        final int length = word.length();

        if (length > this.maxInputCharsPerWord) {
            assert this.vocab.containsKey(this.unknownToken);
            return Collections.singletonList(this.vocab.get(this.unknownToken));
        }

        List<Integer> output = new ArrayList<>();
        int start = 0;
        while (start < length) {
            String match = null;
            int end = length;
            // Shrink the candidate from the right until it is found in the vocabulary.
            for (; start < end; --end) {
                String piece = word.substring(start, end);
                String candidate = start > 0 ? CONTINUATION + piece : piece;
                if (this.vocab.containsKey(candidate)) {
                    match = candidate;
                    break;
                }
            }
            if (match == null) {
                // Nothing matches at this position: the whole word becomes unknown.
                return Collections.singletonList(this.vocab.get(this.unknownToken));
            }
            output.add(this.vocab.get(match));
            // 'end' is the exclusive end of the matched piece; continue from there.
            start = end;
        }
        return output;
    }
}

