/*
 * Decompiled with CFR 0.152.
 */
package org.elasticsearch.xpack.ml.inference.nlp.tokenizers;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.BasicTokenizer;

/**
 * Breaks words into sub-word pieces that are present in a fixed vocabulary.
 *
 * <p>Each word is matched greedily, longest candidate first, starting from the
 * left. Every piece after the first one in a word is looked up with the
 * {@code ##} prefix. A word that cannot be fully covered by vocabulary entries,
 * or that is longer than {@code maxInputCharsPerWord}, is replaced by the single
 * unknown token.
 */
public class WordPieceTokenizer {
    /** Prefix marking a piece that does not start its word. */
    private static final String CONTINUATION = "##";
    private final Map<String, Integer> vocab;
    private final String unknownToken;
    private final int maxInputCharsPerWord;

    /**
     * @param vocab                maps each known piece (continuation pieces include the
     *                             {@code ##} prefix) to its id
     * @param unknownToken         token emitted for words that cannot be split; it must be a
     *                             key of {@code vocab} by the time such a word is encountered
     * @param maxInputCharsPerWord words whose {@link String#length()} exceeds this value are
     *                             mapped straight to the unknown token
     */
    public WordPieceTokenizer(Map<String, Integer> vocab, String unknownToken, int maxInputCharsPerWord) {
        this.vocab = vocab;
        this.unknownToken = unknownToken;
        this.maxInputCharsPerWord = maxInputCharsPerWord;
    }

    /**
     * Tokenizes {@code text} into vocabulary pieces.
     *
     * <p>The text is first split with {@code BasicTokenizer.whiteSpaceTokenize}
     * (presumably on whitespace; see that method for the exact rules), then each
     * resulting word is split into pieces independently.
     *
     * @param text the input text
     * @return the pieces, in input order, each paired with its vocabulary id
     * @throws IllegalStateException if the unknown token has to be emitted but is
     *                               not present in the vocabulary
     */
    public List<TokenAndId> tokenize(String text) {
        String[] words = BasicTokenizer.whiteSpaceTokenize(text);
        List<TokenAndId> output = new ArrayList<>();
        for (String word : words) {
            if (word.length() > this.maxInputCharsPerWord) {
                output.add(unknownTokenAndId());
                continue;
            }
            // Collect into a scratch list so that a partially matched word
            // contributes nothing but the unknown token.
            List<TokenAndId> pieces = new ArrayList<>();
            if (splitIntoPieces(word, pieces)) {
                output.addAll(pieces);
            } else {
                output.add(unknownTokenAndId());
            }
        }
        return output;
    }

    /**
     * Greedily splits {@code word} into vocabulary pieces, appending them to {@code pieces}.
     *
     * @return {@code true} if the whole word was covered; {@code false} as soon as some
     *         position has no matching vocabulary entry (in which case {@code pieces} may
     *         hold a partial result that the caller should discard)
     */
    private boolean splitIntoPieces(String word, List<TokenAndId> pieces) {
        int length = word.length();
        int start = 0;
        while (start < length) {
            String match = null;
            Integer matchId = null;
            int end = length;
            // Try the longest remaining substring first and shrink from the right.
            for (; end > start; end--) {
                String candidate = word.substring(start, end);
                if (start > 0) {
                    candidate = CONTINUATION + candidate;
                }
                // Single lookup; a key mapped to null is treated as absent rather
                // than failing later on unboxing.
                Integer id = this.vocab.get(candidate);
                if (id != null) {
                    match = candidate;
                    matchId = id;
                    break;
                }
            }
            if (match == null) {
                return false;
            }
            pieces.add(new TokenAndId(match, matchId));
            start = end;
        }
        return true;
    }

    /**
     * Builds the entry for the unknown token, failing with a descriptive error
     * instead of an unboxing {@link NullPointerException} when the vocabulary
     * does not contain it.
     */
    private TokenAndId unknownTokenAndId() {
        Integer id = this.vocab.get(this.unknownToken);
        if (id == null) {
            throw new IllegalStateException(
                "unknown token [" + this.unknownToken + "] is not present in the vocabulary");
        }
        return new TokenAndId(this.unknownToken, id);
    }

    /** A vocabulary piece paired with its numeric id. */
    public static class TokenAndId {
        private final String token;
        private final int id;

        TokenAndId(String token, int id) {
            this.token = token;
            this.id = id;
        }

        /** @return the vocabulary id of this piece */
        public int getId() {
            return this.id;
        }

        /** @return the piece text, including the {@code ##} prefix for continuation pieces */
        public String getToken() {
            return this.token;
        }
    }
}

