/*
 * Decompiled with CFR 0.152.
 */
package org.elasticsearch.xpack.ml.inference.nlp.tokenizers;

import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.stream.Collectors;
import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.DelimitedToken;
import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.TokenTrieNode;

/**
 * Splits text into tokens on whitespace and punctuation, with optional
 * lower-casing, accent stripping and isolation of CJK ideographs.
 *
 * <p>A set of "never split" strings can be supplied; when the tokens produced
 * for a region of the input spell out one of those strings, they are merged
 * back into a single token.
 *
 * <p>Token positions are UTF-16 offsets into the <em>normalized</em> text,
 * i.e. the input after {@link #cleanText(String)} and, when enabled,
 * {@link #tokenizeCjkChars(String)}. They only coincide with offsets into the
 * caller's input when normalization neither removed nor inserted characters.
 */
public class BasicTokenizer {
    private final boolean isLowerCase;
    private final boolean isTokenizeCjkChars;
    private final boolean isStripAccents;
    private final Set<String> neverSplitTokens;
    private final TokenTrieNode neverSplitTokenTrieRoot;

    /**
     * Creates a tokenizer.
     *
     * @param isLowerCase        lower-case every token (using {@link Locale#ROOT})
     * @param isTokenizeCjkChars surround CJK ideographs with spaces so each becomes its own token
     * @param isStripAccents     remove non-spacing marks after NFD normalization
     * @param neverSplit         strings that must be kept as a single token; the set is stored
     *                           by reference and also used to build the lookup trie
     */
    public BasicTokenizer(boolean isLowerCase, boolean isTokenizeCjkChars, boolean isStripAccents, Set<String> neverSplit) {
        this.isLowerCase = isLowerCase;
        this.isTokenizeCjkChars = isTokenizeCjkChars;
        this.isStripAccents = isStripAccents;
        this.neverSplitTokens = neverSplit;
        // The trie is keyed by the tokens this tokenizer itself produces for each
        // never-split string, so the option fields must be assigned before this call.
        this.neverSplitTokenTrieRoot = TokenTrieNode.build(neverSplit, this::doTokenizeString);
    }

    /** Creates a tokenizer with no never-split tokens. */
    public BasicTokenizer(boolean isLowerCase, boolean isTokenizeCjkChars, boolean isStripAccents) {
        this(isLowerCase, isTokenizeCjkChars, isStripAccents, Collections.emptySet());
    }

    /** Creates a tokenizer whose accent stripping follows the {@code isLowerCase} setting. */
    public BasicTokenizer(boolean isLowerCase, boolean isTokenizeCjkChars) {
        this(isLowerCase, isTokenizeCjkChars, isLowerCase);
    }

    /** Creates a tokenizer with lower-casing, CJK splitting and accent stripping all enabled. */
    BasicTokenizer() {
        this(true, true, true);
    }

    /**
     * Tokenizes {@code text} and merges any token runs that spell out a never-split string.
     *
     * @param text the text to tokenize
     * @return the tokens in input order
     */
    public List<DelimitedToken> tokenize(String text) {
        // Normalize once and hand the same string to both steps: token offsets
        // refer to the normalized text, so the merge step must read from it too.
        String normalized = this.normalizeText(text);
        return this.mergeNeverSplitTokens(normalized, this.tokenizeNormalized(normalized));
    }

    /** Tokenizes {@code text} and returns only the token strings (used to build the never-split trie). */
    private List<String> doTokenizeString(String text) {
        return this.doTokenize(text).stream().map(DelimitedToken::getToken).collect(Collectors.toList());
    }

    /** Normalizes and tokenizes {@code text} without merging never-split tokens. */
    private List<DelimitedToken> doTokenize(String text) {
        return this.tokenizeNormalized(this.normalizeText(text));
    }

    /** Cleans {@code text} and, when enabled, pads CJK ideographs with spaces. */
    private String normalizeText(String text) {
        String normalized = BasicTokenizer.cleanText(text);
        if (this.isTokenizeCjkChars) {
            normalized = BasicTokenizer.tokenizeCjkChars(normalized);
        }
        return normalized;
    }

    /**
     * Splits already-normalized text on spaces, applies the configured
     * lower-casing / accent stripping to each piece and then splits the
     * pieces on punctuation.
     */
    private List<DelimitedToken> tokenizeNormalized(String normalizedText) {
        List<DelimitedToken> tokens = BasicTokenizer.whiteSpaceTokenize(normalizedText);
        List<DelimitedToken> processedTokens = new ArrayList<>(tokens.size());
        for (DelimitedToken tokenRecord : tokens) {
            String tokenStr = tokenRecord.getToken();
            if (tokenStr.isEmpty()) {
                continue;
            }
            if (this.isLowerCase) {
                tokenStr = tokenStr.toLowerCase(Locale.ROOT);
            }
            if (this.isStripAccents) {
                tokenStr = BasicTokenizer.stripAccents(tokenStr);
            }
            processedTokens.addAll(
                BasicTokenizer.splitOnPunctuation(new DelimitedToken(tokenRecord.getStartPos(), tokenRecord.getEndPos(), tokenStr))
            );
        }
        return processedTokens;
    }

    /**
     * Walks {@code tokens} through the never-split trie and replaces each run of
     * tokens that reaches a leaf with a single token, provided the text covered
     * by the run is literally one of the never-split strings.
     *
     * @param sourceText the text the token offsets refer to
     * @param tokens     tokens produced from {@code sourceText}
     * @return the tokens with never-split runs merged; no input token is dropped
     */
    private List<DelimitedToken> mergeNeverSplitTokens(String sourceText, List<DelimitedToken> tokens) {
        if (this.neverSplitTokenTrieRoot.isLeaf()) {
            // Root has nothing below it (presumably: no never-split tokens configured).
            return tokens;
        }
        List<DelimitedToken> mergedTokens = new ArrayList<>(tokens.size());
        // Tokens matched so far along the current trie path.
        List<DelimitedToken> pending = new ArrayList<>();
        TokenTrieNode current = this.neverSplitTokenTrieRoot;
        for (DelimitedToken token : tokens) {
            TokenTrieNode childNode = current.getChild(token.getToken());
            if (childNode == null && current != this.neverSplitTokenTrieRoot) {
                // The partial match cannot be extended: emit it unmerged and
                // retry this token from the root so it can start a new match.
                mergedTokens.addAll(pending);
                pending = new ArrayList<>();
                current = this.neverSplitTokenTrieRoot;
                childNode = current.getChild(token.getToken());
            }
            if (childNode == null) {
                mergedTokens.add(token);
            } else if (childNode.isLeaf()) {
                pending.add(token);
                DelimitedToken mergedToken = DelimitedToken.mergeTokens(pending);
                int start = mergedToken.getStartPos();
                int end = mergedToken.getEndPos();
                // Offsets can drift when lower-casing or accent stripping changes a
                // token's length; never let that turn into an out-of-bounds failure.
                String coveredText = (start >= 0 && start <= end && end <= sourceText.length())
                    ? sourceText.substring(start, end)
                    : null;
                if (coveredText != null && this.neverSplitTokens.contains(coveredText)) {
                    mergedTokens.add(new DelimitedToken(start, end, coveredText));
                } else {
                    // Same token sequence but different underlying text (e.g. different case).
                    mergedTokens.addAll(pending);
                }
                pending = new ArrayList<>();
                current = this.neverSplitTokenTrieRoot;
            } else {
                pending.add(token);
                current = childNode;
            }
        }
        // Input ended in the middle of a partial match: keep those tokens.
        mergedTokens.addAll(pending);
        return mergedTokens;
    }

    public boolean isLowerCase() {
        return this.isLowerCase;
    }

    public boolean isStripAccents() {
        return this.isStripAccents;
    }

    public boolean isTokenizeCjkChars() {
        return this.isTokenizeCjkChars;
    }

    /**
     * Splits {@code text} on runs of the space character ({@code ' '}) only.
     * Leading, trailing and repeated spaces produce no empty tokens.
     *
     * @return tokens whose positions are {@code [start, end)} offsets into {@code text}
     */
    static List<DelimitedToken> whiteSpaceTokenize(String text) {
        List<DelimitedToken> tokens = new ArrayList<>();
        int length = text.length();
        int index = 0;
        while (index < length) {
            while (index < length && text.charAt(index) == ' ') {
                ++index;
            }
            if (index == length) {
                break;
            }
            int tokenStart = index;
            while (index < length && text.charAt(index) != ' ') {
                ++index;
            }
            tokens.add(new DelimitedToken(tokenStart, index, text.substring(tokenStart, index)));
        }
        return tokens;
    }

    /**
     * Decomposes {@code word} with NFD and drops every code point whose general
     * category is {@link Character#NON_SPACING_MARK}.
     */
    static String stripAccents(String word) {
        String normalizedString = Normalizer.normalize(word, Normalizer.Form.NFD);
        int[] codePoints = normalizedString.codePoints()
            .filter(codePoint -> Character.getType(codePoint) != Character.NON_SPACING_MARK)
            .toArray();
        return new String(codePoints, 0, codePoints.length);
    }

    /**
     * Splits a token so that every punctuation code point becomes a token of its
     * own and the text between punctuation marks forms the remaining tokens.
     *
     * <p>Positions of the pieces are the start position of {@code word} plus the
     * UTF-16 offset of the piece inside the token string.
     */
    static List<DelimitedToken> splitOnPunctuation(DelimitedToken word) {
        List<DelimitedToken> splits = new ArrayList<>();
        String token = word.getToken();
        int base = word.getStartPos();
        int segmentStart = 0;
        int offset = 0;
        while (offset < token.length()) {
            int codePoint = token.codePointAt(offset);
            // Advance by chars, not code points, so supplementary characters
            // (two chars each) keep the positions aligned with String offsets.
            int next = offset + Character.charCount(codePoint);
            if (BasicTokenizer.isPunctuationMark(codePoint)) {
                if (offset > segmentStart) {
                    splits.add(new DelimitedToken(base + segmentStart, base + offset, token.substring(segmentStart, offset)));
                }
                splits.add(new DelimitedToken(base + offset, base + next, token.substring(offset, next)));
                segmentStart = next;
            }
            offset = next;
        }
        if (segmentStart < token.length()) {
            splits.add(new DelimitedToken(base + segmentStart, base + token.length(), token.substring(segmentStart)));
        }
        return splits;
    }

    /**
     * Surrounds every CJK ideograph (see {@link #isCjkChar(int)}) with a space on
     * each side. Returns the same {@code text} instance when it contains none.
     */
    static String tokenizeCjkChars(String text) {
        StringBuilder sb = new StringBuilder(text.length());
        boolean cjkCharFound = false;
        int offset = 0;
        while (offset < text.length()) {
            int cp = text.codePointAt(offset);
            offset += Character.charCount(cp);
            if (BasicTokenizer.isCjkChar(cp)) {
                sb.append(' ').appendCodePoint(cp).append(' ');
                cjkCharFound = true;
            } else {
                sb.appendCodePoint(cp);
            }
        }
        return cjkCharFound ? sb.toString() : text;
    }

    /**
     * Removes NUL, U+FFFD (replacement character) and control characters, and
     * replaces every whitespace code point with a plain space.
     */
    static String cleanText(String text) {
        int[] codePoints = text.codePoints()
            .filter(codePoint -> !(codePoint == 0x0 || codePoint == 0xFFFD || BasicTokenizer.isControlChar(codePoint)))
            .map(codePoint -> BasicTokenizer.isWhiteSpace(codePoint) ? ' ' : codePoint)
            .toArray();
        return new String(codePoints, 0, codePoints.length);
    }

    /** Returns true when {@code codePoint} lies in one of the CJK ideograph Unicode blocks listed below. */
    static boolean isCjkChar(int codePoint) {
        Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
        return Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS.equals(block)
            || Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS.equals(block)
            || Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A.equals(block)
            || Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B.equals(block)
            || Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C.equals(block)
            || Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D.equals(block)
            || Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E.equals(block)
            || Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT.equals(block);
    }

    /**
     * Returns true for code points whose general category falls in the range
     * {@link Character#CONTROL} to {@link Character#SURROGATE} (control, format,
     * private use, surrogate). Newline, carriage return and tab are excluded
     * because they are treated as whitespace instead.
     */
    static boolean isControlChar(int codePoint) {
        if (codePoint == '\n' || codePoint == '\r' || codePoint == '\t') {
            return false;
        }
        int category = Character.getType(codePoint);
        return category >= Character.CONTROL && category <= Character.SURROGATE;
    }

    /**
     * Returns true for newline, carriage return, tab and any code point of
     * category {@link Character#SPACE_SEPARATOR}.
     */
    static boolean isWhiteSpace(int codePoint) {
        if (codePoint == '\n' || codePoint == '\r' || codePoint == '\t') {
            return true;
        }
        return Character.getType(codePoint) == Character.SPACE_SEPARATOR;
    }

    /**
     * Returns true for every non-alphanumeric printable ASCII character
     * ({@code !} to {@code /}, {@code :} to {@code @}, {@code [} to {@code `},
     * <code>{</code> to {@code ~}) and for any code point in one of the Unicode
     * punctuation categories.
     */
    static boolean isPunctuationMark(int codePoint) {
        if (codePoint >= '!' && codePoint <= '/'
            || codePoint >= ':' && codePoint <= '@'
            || codePoint >= '[' && codePoint <= '`'
            || codePoint >= '{' && codePoint <= '~') {
            return true;
        }
        int category = Character.getType(codePoint);
        // DASH, START, END, CONNECTOR, OTHER punctuation are contiguous (20..24);
        // INITIAL_QUOTE and FINAL_QUOTE punctuation follow the symbol categories (29..30).
        return category >= Character.DASH_PUNCTUATION && category <= Character.OTHER_PUNCTUATION
            || category >= Character.INITIAL_QUOTE_PUNCTUATION && category <= Character.FINAL_QUOTE_PUNCTUATION;
    }
}

