/*
 * Decompiled with CFR 0.152.
 */
package org.apache.sysds.runtime.transform.tokenize;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.sysds.runtime.matrix.data.FrameBlock;
import org.apache.sysds.runtime.transform.tokenize.Tokenizer;
import org.apache.sysds.runtime.transform.tokenize.TokenizerPre;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;

/**
 * Pre-tokenization step that splits the text of one frame column into tokens
 * using a regular expression (one or more whitespace characters by default).
 *
 * <p>For every row of the input frame, the values of the configured id columns
 * are collected as the document keys, and the text of the tokenize column is
 * split into {@link Tokenizer.Token}s that carry the token text together with
 * its start offset in the original text.
 */
public class TokenizerPreWhitespaceSplit
implements TokenizerPre {
    private static final long serialVersionUID = 539127244034913364L;
    private final Params params;
    private final List<Integer> idCols;
    private final int tokenizeCol;

    /**
     * Creates a new splitter.
     *
     * @param idCols      1-based indices of the columns whose values form the document keys
     * @param tokenizeCol 1-based index of the column holding the text to split
     * @param params      optional JSON configuration; a {@code "regex"} entry overrides the
     *                    default delimiter pattern, may be {@code null}
     * @throws JSONException if the {@code "regex"} entry cannot be read as a string
     */
    public TokenizerPreWhitespaceSplit(List<Integer> idCols, int tokenizeCol, JSONObject params) throws JSONException {
        this.idCols = idCols;
        this.tokenizeCol = tokenizeCol;
        this.params = new Params(params);
    }

    /**
     * Splits {@code text} around matches of the configured regex and records, for each
     * resulting token, the offset at which it was located in {@code text}.
     *
     * <p>Splitting follows {@link String#split(String)}, so a delimiter at the very start
     * of the text yields a leading empty token and trailing empty tokens are dropped.
     *
     * @param text the text to split; {@code null} is treated as a document without tokens
     * @return the tokens in order of appearance, never {@code null}
     */
    public List<Tokenizer.Token> splitToTokens(String text) {
        List<Tokenizer.Token> tokenList = new ArrayList<>();
        if (text == null) {
            // A missing cell has nothing to split; avoid a NullPointerException on split().
            return tokenList;
        }
        String[] textTokens = text.split(this.params.regex);
        int searchFrom = 0;
        for (String textToken : textTokens) {
            int tokenIndex = text.indexOf(textToken, searchFrom);
            if (tokenIndex >= 0) {
                // Continue the search BEHIND this token. Restarting at the token's own
                // start would let the next lookup match at or inside the previous token
                // (e.g. "a a" or "ab b") and report a wrong, non-increasing offset.
                searchFrom = tokenIndex + textToken.length();
            }
            // NOTE(review): with a custom regex whose delimiter text can itself contain a
            // token's text, indexOf may still land inside a delimiter; the default
            // whitespace pattern is not affected.
            tokenList.add(new Tokenizer.Token(textToken, tokenIndex));
        }
        return tokenList;
    }

    /**
     * Converts every row of the input frame into its document keys and token list.
     *
     * @param in the frame to read rows from
     * @return one {@link Tokenizer.DocumentToTokens} entry per row, in iteration order
     */
    @Override
    public List<Tokenizer.DocumentToTokens> tokenizePre(FrameBlock in) {
        List<Tokenizer.DocumentToTokens> documentsToTokenList = new ArrayList<>();
        Iterator<String[]> rows = in.getStringRowIterator();
        while (rows.hasNext()) {
            String[] row = rows.next();
            // Column indices are 1-based, the row array is 0-based.
            String text = row[this.tokenizeCol - 1];
            ArrayList<Object> keys = new ArrayList<>();
            for (Integer idCol : this.idCols) {
                keys.add(row[idCol - 1]);
            }
            List<Tokenizer.Token> tokenList = this.splitToTokens(text);
            documentsToTokenList.add(new Tokenizer.DocumentToTokens(keys, tokenList));
        }
        return documentsToTokenList;
    }

    /**
     * Configuration of the splitter, read from an optional JSON object.
     */
    static class Params
    implements Serializable {
        private static final long serialVersionUID = -4368552847660442628L;
        /** Delimiter pattern; defaults to one or more whitespace characters. */
        public String regex = "\\s+";

        /**
         * Reads the configuration, keeping the defaults for anything not specified.
         *
         * @param json configuration object, may be {@code null}
         * @throws JSONException if the {@code "regex"} entry cannot be read as a string
         */
        public Params(JSONObject json) throws JSONException {
            if (json != null && json.has("regex")) {
                this.regex = json.getString("regex");
            }
        }
    }
}

