/*
 * Decompiled with CFR 0.152.
 */
package org.apache.sysml.runtime.matrix.data;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import org.apache.sysml.lops.PartialAggregate;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.codegen.SpoofOperator;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.functionobjects.Builtin;
import org.apache.sysml.runtime.functionobjects.CM;
import org.apache.sysml.runtime.functionobjects.IndexFunction;
import org.apache.sysml.runtime.functionobjects.KahanFunction;
import org.apache.sysml.runtime.functionobjects.KahanPlus;
import org.apache.sysml.runtime.functionobjects.KahanPlusSq;
import org.apache.sysml.runtime.functionobjects.Mean;
import org.apache.sysml.runtime.functionobjects.Multiply;
import org.apache.sysml.runtime.functionobjects.ReduceAll;
import org.apache.sysml.runtime.functionobjects.ReduceCol;
import org.apache.sysml.runtime.functionobjects.ReduceDiag;
import org.apache.sysml.runtime.functionobjects.ReduceRow;
import org.apache.sysml.runtime.functionobjects.ValueFunction;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.instructions.cp.CM_COV_Object;
import org.apache.sysml.runtime.instructions.cp.Data;
import org.apache.sysml.runtime.instructions.cp.KahanObject;
import org.apache.sysml.runtime.matrix.data.DenseBlock;
import org.apache.sysml.runtime.matrix.data.DenseBlockFactory;
import org.apache.sysml.runtime.matrix.data.LibMatrixMult;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import org.apache.sysml.runtime.matrix.data.MatrixValue;
import org.apache.sysml.runtime.matrix.data.SparseBlock;
import org.apache.sysml.runtime.matrix.data.SparseBlockCSR;
import org.apache.sysml.runtime.matrix.data.SparseBlockFactory;
import org.apache.sysml.runtime.matrix.operators.AggregateOperator;
import org.apache.sysml.runtime.matrix.operators.AggregateTernaryOperator;
import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
import org.apache.sysml.runtime.matrix.operators.CMOperator;
import org.apache.sysml.runtime.matrix.operators.Operator;
import org.apache.sysml.runtime.matrix.operators.UnaryOperator;
import org.apache.sysml.runtime.util.CommonThreadPool;
import org.apache.sysml.runtime.util.DataConverter;
import org.apache.sysml.runtime.util.UtilFunctions;

public class LibMatrixAgg {
    // Compile-time flag; it is not referenced anywhere in the visible part of this file.
    // NOTE(review): presumably toggles NaN-aware handling in min/max kernels - confirm at its uses.
    private static final boolean NAN_AWARENESS = false;
    // 0x100000 = 1,048,576. The same literal appears inlined in the sequential-fallback checks of
    // the multi-threaded entry points, where it is compared against cell or non-zero counts.
    private static final long PAR_NUMCELL_THRESHOLD1 = 0x100000L;
    // 16,384. The same literal appears in satisfiesMultiThreadingConstraints as the non-zero
    // threshold that applies when k equals the local parallelism.
    private static final long PAR_NUMCELL_THRESHOLD2 = 16384L;
    // 0x200000 = 2,097,152. The same literal bounds the product clen * 8 * k in the
    // multi-threading checks of this file.
    private static final long PAR_INTERMEDIATE_SIZE_THRESHOLD = 0x200000L;

    /** Private constructor: every member visible in this class is static, so no instances are needed. */
    private LibMatrixAgg() {
        // intentionally empty
    }

    /**
     * Aggregates {@code in} into the value block {@code aggVal} and its separately stored
     * correction block {@code aggCorr}, selecting a kernel by the dense/sparse format of the
     * three blocks.
     *
     * @param in      block to aggregate; an empty block makes this call a no-op
     * @param aggVal  running aggregate values
     * @param aggCorr running corrections belonging to {@code aggVal}
     * @param deep    if {@code false} and {@code aggVal} is empty, {@code aggVal} just takes a
     *                shallow copy of {@code in} and no kernel is run
     */
    public static void aggregateBinaryMatrix(MatrixBlock in, MatrixBlock aggVal, MatrixBlock aggCorr, boolean deep) {
        // Nothing to add.
        if (in.isEmptyBlock(false)) {
            return;
        }
        // First contribution and no deep copy requested: adopt the input as is.
        if (!deep && aggVal.isEmptyBlock(false)) {
            aggVal.copyShallow(in);
            return;
        }

        // CSR targets are converted to MCSR before they are handed to the kernels.
        boolean valueIsCsr = aggVal.sparse && aggVal.isAllocated() && aggVal.getSparseBlock() instanceof SparseBlockCSR;
        if (valueIsCsr) {
            aggVal.sparseBlock = SparseBlockFactory.copySparseBlock(SparseBlock.Type.MCSR, aggVal.getSparseBlock(), true);
        }
        boolean corrIsCsr = aggCorr.sparse && aggCorr.isAllocated() && aggCorr.getSparseBlock() instanceof SparseBlockCSR;
        if (corrIsCsr) {
            aggCorr.sparseBlock = SparseBlockFactory.copySparseBlock(SparseBlock.Type.MCSR, aggCorr.getSparseBlock(), true);
        }

        // Kernel selection: input format first, then whether both targets are dense.
        boolean denseTargets = !aggVal.sparse && !aggCorr.sparse;
        if (in.sparse) {
            if (denseTargets) {
                LibMatrixAgg.aggregateBinaryMatrixSparseDense(in, aggVal, aggCorr);
            } else {
                LibMatrixAgg.aggregateBinaryMatrixSparseGeneric(in, aggVal, aggCorr);
            }
        } else {
            if (denseTargets) {
                LibMatrixAgg.aggregateBinaryMatrixAllDense(in, aggVal, aggCorr);
            } else {
                LibMatrixAgg.aggregateBinaryMatrixDenseGeneric(in, aggVal, aggCorr);
            }
        }
    }

    /**
     * Aggregates {@code in} into {@code aggVal}, where the correction is kept inside the blocks
     * themselves at the location given by {@code aop.correctionLocation}.
     *
     * @param in     block to aggregate
     * @param aggVal running aggregate; must have the same dimensions as {@code in}
     * @param aop    aggregate operator whose correction location selects the kernel
     * @throws DMLRuntimeException if the dimensions of {@code in} and {@code aggVal} differ
     */
    public static void aggregateBinaryMatrix(MatrixBlock in, MatrixBlock aggVal, AggregateOperator aop) {
        boolean sameRows = in.getNumRows() == aggVal.getNumRows();
        if (!sameRows || in.getNumColumns() != aggVal.getNumColumns()) {
            throw new DMLRuntimeException("Dimension mismatch on aggregate: " + in.getNumRows() + "x" + in.getNumColumns() + " vs " + aggVal.getNumRows() + "x" + aggVal.getNumColumns());
        }

        boolean corrInLastRow = aop.correctionLocation == PartialAggregate.CorrectionLocationType.LASTROW;
        boolean corrInLastCol = aop.correctionLocation == PartialAggregate.CorrectionLocationType.LASTCOLUMN;

        if (corrInLastRow) {
            if (in.sparse) {
                LibMatrixAgg.aggregateBinaryMatrixLastRowSparseGeneric(in, aggVal);
            } else {
                LibMatrixAgg.aggregateBinaryMatrixLastRowDenseGeneric(in, aggVal);
            }
            return;
        }
        if (!in.sparse && corrInLastCol) {
            LibMatrixAgg.aggregateBinaryMatrixLastColDenseGeneric(in, aggVal);
            return;
        }
        // Every remaining combination is routed to the last-column sparse kernel.
        LibMatrixAgg.aggregateBinaryMatrixLastColSparseGeneric(in, aggVal);
    }

    /**
     * Single-threaded unary aggregate of {@code in} into {@code out}.
     *
     * @param in   input block
     * @param out  output block; its current dimensions are kept and it is reset to dense format
     * @param uaop unary aggregate operator (value function plus index function)
     */
    public static void aggregateUnaryMatrix(MatrixBlock in, MatrixBlock out, AggregateUnaryOperator uaop) {
        final AggType type = LibMatrixAgg.getAggType(uaop);
        final int inRows = in.rlen;
        final int outRows = out.rlen;
        final int outCols = out.clen;

        // Empty inputs are delegated to a dedicated routine.
        if (in.isEmptyBlock(false)) {
            LibMatrixAgg.aggregateUnaryMatrixEmpty(in, out, type, uaop.indexFn);
            return;
        }

        // The kernels write into a dense output.
        out.reset(outRows, outCols, false);
        out.allocateDenseBlock();

        if (in.sparse) {
            LibMatrixAgg.aggregateUnaryMatrixSparse(in, out, type, uaop.aggOp.increOp.fn, uaop.indexFn, 0, inRows);
        } else {
            LibMatrixAgg.aggregateUnaryMatrixDense(in, out, type, uaop.aggOp.increOp.fn, uaop.indexFn, 0, inRows);
        }

        // Refresh the non-zero count, then re-examine the output's sparsity.
        out.recomputeNonZeros();
        out.examSparsity();
    }

    /**
     * Multi-threaded unary aggregate of {@code in} into {@code out}, partitioned over row ranges.
     * Falls back to the single-threaded variant when
     * {@link #satisfiesMultiThreadingConstraints} rejects the configuration.
     *
     * <p>For {@code ReduceCol}, {@code out} is allocated up front and {@code RowAggTask}s are
     * used. For every other index function {@code PartialAggTask}s are used and their results
     * are merged into {@code out} afterwards.
     *
     * @param in   input block
     * @param out  output block
     * @param uaop unary aggregate operator
     * @param k    requested degree of parallelism
     * @throws DMLRuntimeException wrapping any failure of the thread pool or of a worker task
     */
    public static void aggregateUnaryMatrix(MatrixBlock in, MatrixBlock out, AggregateUnaryOperator uaop, int k) {
        if (!LibMatrixAgg.satisfiesMultiThreadingConstraints(in, out, uaop, k)) {
            LibMatrixAgg.aggregateUnaryMatrix(in, out, uaop);
            return;
        }
        AggType aggtype = LibMatrixAgg.getAggType(uaop);
        int m = in.rlen;
        int m2 = out.rlen;
        int n2 = out.clen;
        if (in.isEmptyBlock(false)) {
            LibMatrixAgg.aggregateUnaryMatrixEmpty(in, out, aggtype, uaop.indexFn);
            return;
        }
        if (uaop.indexFn instanceof ReduceCol) {
            out.reset(m2, n2, false);
            out.allocateDenseBlock();
        }
        ExecutorService pool = null;
        try {
            pool = CommonThreadPool.get(k);
            // NOTE(review): the element type and the cast below are kept exactly as found; the
            // task class hierarchy is declared outside the visible part of this file.
            ArrayList<RowAggTask> tasks = new ArrayList<RowAggTask>();
            ArrayList<Integer> blklens = UtilFunctions.getBalancedBlockSizesDefault(m, k, uaop.indexFn instanceof ReduceRow);
            int lb = 0;
            for (int i = 0; i < blklens.size(); ++i) {
                tasks.add((RowAggTask)(uaop.indexFn instanceof ReduceCol ? new RowAggTask(in, out, aggtype, uaop, lb, lb + blklens.get(i)) : new PartialAggTask(in, out, aggtype, uaop, lb, lb + blklens.get(i))));
                lb += blklens.get(i).intValue();
            }
            // invokeAll only reports task failures through the returned futures; calling get()
            // surfaces them instead of silently continuing with an incomplete result.
            for (Future<?> task : pool.invokeAll(tasks)) {
                task.get();
            }
            if (!(uaop.indexFn instanceof ReduceCol)) {
                // The first partial result initialises out, the rest are folded into it.
                out.copy(((PartialAggTask)tasks.get(0)).getResult());
                for (int i = 1; i < tasks.size(); ++i) {
                    LibMatrixAgg.aggregateFinalResult(uaop.aggOp, out, ((PartialAggTask)tasks.get(i)).getResult());
                }
            }
        }
        catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        }
        finally {
            // Shut the pool down on the failure path as well.
            if (pool != null) {
                pool.shutdown();
            }
        }
        out.recomputeNonZeros();
        out.examSparsity();
    }

    /**
     * Single-threaded cumulative aggregate of {@code in} into {@code out}.
     *
     * @param in  input block
     * @param out output block; its current dimensions are kept and it is reset to dense format
     * @param uop cumulative unary operator
     * @return {@code out}, or whatever the empty-input routine returns when {@code in} is empty
     */
    public static MatrixBlock cumaggregateUnaryMatrix(MatrixBlock in, MatrixBlock out, UnaryOperator uop) {
        final AggType type = LibMatrixAgg.getAggType(uop);
        final int inRows = in.rlen;
        final int outRows = out.rlen;
        final int outCols = out.clen;

        if (in.isEmptyBlock(false)) {
            return LibMatrixAgg.aggregateUnaryMatrixEmpty(in, out, type, null);
        }

        out.reset(outRows, outCols, false);
        out.allocateDenseBlock();

        // Process the full row range [0, inRows) with the kernel matching the input format.
        if (in.sparse) {
            LibMatrixAgg.cumaggregateUnaryMatrixSparse(in, out, type, uop.fn, null, 0, inRows);
        } else {
            LibMatrixAgg.cumaggregateUnaryMatrixDense(in, out, type, uop.fn, null, 0, inRows);
        }

        out.recomputeNonZeros();
        out.examSparsity();
        return out;
    }

    /**
     * Multi-threaded cumulative aggregate of {@code in} into {@code out}, in three phases:
     * <ol>
     *   <li>aggregate every row partition with the corresponding non-cumulative operator;</li>
     *   <li>run the sequential cumulative aggregate over those partition aggregates;</li>
     *   <li>run the cumulative aggregate per partition, passing each partition the cumulative
     *       aggregate of the partitions before it.</li>
     * </ol>
     * Falls back to the sequential variant for {@code k <= 1}, small inputs, fewer rows than
     * threads, large intermediates, operators without a basic aggregate counterpart, or an
     * output that is not thread safe.
     *
     * @param in  input block
     * @param out output block; its current dimensions are kept and it is reset to dense format
     * @param uop cumulative unary operator
     * @param k   requested degree of parallelism
     * @return {@code out}, or the result of the sequential / empty-input routine
     * @throws DMLRuntimeException wrapping any failure of the thread pool or of a worker task
     */
    public static MatrixBlock cumaggregateUnaryMatrix(MatrixBlock in, MatrixBlock out, UnaryOperator uop, int k) {
        AggregateUnaryOperator uaop = InstructionUtils.parseBasicCumulativeAggregateUnaryOperator(uop);
        // Widen to long BEFORE multiplying: clen * 8 * k can exceed the int range, and a
        // wrapped-around negative product would wrongly pass the size check.
        boolean intermediatesTooLarge = (long)out.clen * 8L * (long)k > 0x200000L;
        if (k <= 1 || (long)in.rlen * (long)in.clen < 0x100000L || in.rlen <= k || intermediatesTooLarge || uaop == null || !out.isThreadSafe()) {
            return LibMatrixAgg.cumaggregateUnaryMatrix(in, out, uop);
        }
        AggType aggtype = LibMatrixAgg.getAggType(uop);
        int m = in.rlen;
        int m2 = out.rlen;
        int n2 = out.clen;
        // Partition aggregates of the Kahan sum are created with two rows instead of one.
        int mk = aggtype == AggType.CUM_KAHAN_SUM ? 2 : 1;
        if (in.isEmptyBlock(false)) {
            return LibMatrixAgg.aggregateUnaryMatrixEmpty(in, out, aggtype, null);
        }
        out.reset(m2, n2, false);
        out.allocateDenseBlock();
        ExecutorService pool = null;
        try {
            pool = CommonThreadPool.get(k);
            int blklen = (int)Math.ceil((double)m / (double)k);

            // Phase 1: one aggregate per row partition.
            AggType uaoptype = LibMatrixAgg.getAggType(uaop);
            ArrayList<PartialAggTask> tasks = new ArrayList<PartialAggTask>();
            for (int i = 0; i < k && i * blklen < m; ++i) {
                tasks.add(new PartialAggTask(in, new MatrixBlock(mk, n2, false), uaoptype, uaop, i * blklen, Math.min((i + 1) * blklen, m)));
            }
            for (Future<?> task : pool.invokeAll(tasks)) {
                task.get(); // surfaces worker failures
            }

            // Phase 2: cumulative aggregate over the partition aggregates (corrections dropped).
            MatrixBlock tmp = new MatrixBlock(tasks.size(), n2, false);
            for (int i = 0; i < tasks.size(); ++i) {
                MatrixBlock row = tasks.get(i).getResult();
                if (uaop.aggOp.correctionExists) {
                    row.dropLastRowsOrColumns(uaop.aggOp.correctionLocation);
                }
                tmp.leftIndexingOperations(row, i, i, 0, n2 - 1, tmp, MatrixObject.UpdateType.INPLACE_PINNED);
            }
            MatrixBlock tmp2 = LibMatrixAgg.cumaggregateUnaryMatrix(tmp, new MatrixBlock(tasks.size(), n2, false), uop);

            // Phase 3: per-partition cumulative aggregate; partition 0 receives no offset vector.
            ArrayList<CumAggTask> tasks2 = new ArrayList<CumAggTask>();
            for (int i = 0; i < k && i * blklen < m; ++i) {
                double[] agg = i == 0 ? null : DataConverter.convertToDoubleVector(tmp2.slice(i - 1, i - 1, 0, n2 - 1, new MatrixBlock()), false);
                tasks2.add(new CumAggTask(in, agg, out, aggtype, uop, i * blklen, Math.min((i + 1) * blklen, m)));
            }

            // Each task's result is added to the output's non-zero count.
            long nnz = 0L;
            for (Future<?> task : pool.invokeAll(tasks2)) {
                nnz += ((Long)task.get()).longValue();
            }
            out.nonZeros = nnz;
        }
        catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        }
        finally {
            // Shut the pool down on the failure path as well.
            if (pool != null) {
                pool.shutdown();
            }
        }
        out.examSparsity();
        return out;
    }

    /**
     * Single-threaded ternary aggregate: sums the cell-wise product of {@code in1}, {@code in2}
     * and (if present) {@code in3} into {@code ret}.
     *
     * @param in1 first factor
     * @param in2 second factor
     * @param in3 optional third factor; may be {@code null}
     * @param ret output block; returned untouched if any supplied input is empty
     * @param op  ternary aggregate operator providing the index function
     * @return {@code ret}
     */
    public static MatrixBlock aggregateTernary(MatrixBlock in1, MatrixBlock in2, MatrixBlock in3, MatrixBlock ret, AggregateTernaryOperator op) {
        // Any empty factor: return ret without touching it.
        boolean anyEmpty = in1.isEmptyBlock(false) || in2.isEmptyBlock(false) || (in3 != null && in3.isEmptyBlock(false));
        if (anyEmpty) {
            return ret;
        }

        ret.reset(ret.rlen, ret.clen, false);
        ret.allocateDenseBlock();

        final int rows = in1.rlen;
        boolean allDense = !in1.sparse && !in2.sparse && (in3 == null || !in3.sparse);
        if (allDense) {
            LibMatrixAgg.aggregateTernaryDense(in1, in2, in3, ret, op.indexFn, 0, rows);
        } else {
            LibMatrixAgg.aggregateTernaryGeneric(in1, in2, in3, ret, op.indexFn, 0, rows);
        }

        ret.recomputeNonZeros();
        ret.examSparsity();
        return ret;
    }

    /**
     * Multi-threaded ternary aggregate, partitioned over row ranges of {@code in1}. Every task
     * returns a partial result; the first initialises {@code ret} and the rest are merged into
     * it. Falls back to the single-threaded variant for {@code k <= 1}, few non-zeros, few rows,
     * or (for index functions other than {@code ReduceCol}) large intermediates.
     *
     * @param in1 first factor
     * @param in2 second factor
     * @param in3 optional third factor; may be {@code null}
     * @param ret output block; returned untouched if any supplied input is empty
     * @param op  ternary aggregate operator
     * @param k   requested degree of parallelism
     * @return {@code ret}
     * @throws DMLRuntimeException wrapping any failure of the thread pool or of a worker task
     */
    public static MatrixBlock aggregateTernary(MatrixBlock in1, MatrixBlock in2, MatrixBlock in3, MatrixBlock ret, AggregateTernaryOperator op, int k) {
        // Widen to long BEFORE multiplying: clen * 8 * k can exceed the int range, and a
        // wrapped-around negative product would wrongly pass the size check.
        boolean intermediatesTooLarge = !(op.indexFn instanceof ReduceCol) && (long)ret.clen * 8L * (long)k > 0x200000L;
        if (k <= 1 || in1.nonZeros + in2.nonZeros < 0x100000L || in1.rlen <= k / 2 || intermediatesTooLarge) {
            return LibMatrixAgg.aggregateTernary(in1, in2, in3, ret, op);
        }
        if (in1.isEmptyBlock(false) || in2.isEmptyBlock(false) || in3 != null && in3.isEmptyBlock(false)) {
            return ret;
        }
        ExecutorService pool = null;
        try {
            pool = CommonThreadPool.get(k);
            ArrayList<AggTernaryTask> tasks = new ArrayList<AggTernaryTask>();
            int blklen = (int)Math.ceil((double)in1.rlen / (double)k);
            IndexFunction ixFn = op.indexFn;
            for (int i = 0; i < k && i * blklen < in1.rlen; ++i) {
                tasks.add(new AggTernaryTask(in1, in2, in3, ret, ixFn, i * blklen, Math.min((i + 1) * blklen, in1.rlen)));
            }
            List<? extends Future<?>> rtasks = pool.invokeAll(tasks);
            // get() both fetches the partial result and surfaces any worker failure.
            ret.copy((MatrixValue)rtasks.get(0).get());
            for (int i = 1; i < rtasks.size(); ++i) {
                LibMatrixAgg.aggregateFinalResult(op.aggOp, ret, (MatrixBlock)rtasks.get(i).get());
            }
        }
        catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        }
        finally {
            // Shut the pool down on the failure path as well.
            if (pool != null) {
                pool.shutdown();
            }
        }
        ret.recomputeNonZeros();
        ret.examSparsity();
        return ret;
    }

    /**
     * Single-threaded grouped aggregate of {@code target} by the group ids in {@code groups}.
     *
     * @param groups    group ids
     * @param target    values to aggregate
     * @param weights   optional weights; may be {@code null}
     * @param result    output block
     * @param numGroups number of groups
     * @param op        a {@code CMOperator} or an {@code AggregateOperator}
     * @throws DMLRuntimeException if {@code op} is neither of the two supported operator kinds
     */
    public static void groupedAggregate(MatrixBlock groups, MatrixBlock target, MatrixBlock weights, MatrixBlock result, int numGroups, Operator op) {
        if (op instanceof CMOperator) {
            CMOperator cm = (CMOperator)op;
            // An unweighted count over a single target column has a dedicated routine.
            boolean plainVectorCount = cm.getAggOpType() == CMOperator.AggregateOperationTypes.COUNT && weights == null && target.clen == 1;
            if (plainVectorCount) {
                LibMatrixAgg.groupedAggregateVecCount(groups, result, numGroups);
            } else {
                LibMatrixAgg.groupedAggregateCM(groups, target, weights, result, numGroups, cm, 0, target.clen);
            }
        } else if (op instanceof AggregateOperator) {
            LibMatrixAgg.groupedAggregateKahanPlus(groups, target, weights, result, numGroups, (AggregateOperator)op, 0, target.clen);
        } else {
            throw new DMLRuntimeException("Invalid operator (" + op + ") encountered while processing groupedAggregate.");
        }
        result.examSparsity();
    }

    /**
     * Multi-threaded grouped aggregate, partitioned over column ranges of {@code target}; all
     * tasks are handed the same dense {@code result} block. Falls back to the single-threaded
     * variant for {@code k <= 1}, small targets, row-vector targets and single-column targets.
     *
     * @param groups    group ids
     * @param target    values to aggregate
     * @param weights   optional weights; may be {@code null}
     * @param result    output block
     * @param numGroups number of groups
     * @param op        a {@code CMOperator} or an {@code AggregateOperator}
     * @param k         requested degree of parallelism
     * @throws DMLRuntimeException for an unsupported operator, or wrapping any failure of the
     *                             thread pool or of a worker task
     */
    public static void groupedAggregate(MatrixBlock groups, MatrixBlock target, MatrixBlock weights, MatrixBlock result, int numGroups, Operator op, int k) {
        boolean rowVector = target.getNumRows() == 1 && target.getNumColumns() > 1;
        if (k <= 1 || (long)target.rlen * (long)target.clen < 0x100000L || rowVector || target.clen == 1) {
            LibMatrixAgg.groupedAggregate(groups, target, weights, result, numGroups, op);
            return;
        }
        if (!(op instanceof CMOperator) && !(op instanceof AggregateOperator)) {
            throw new DMLRuntimeException("Invalid operator (" + op + ") encountered while processing groupedAggregate.");
        }
        // The output is forced to dense format and allocated before the tasks start.
        result.sparse = false;
        result.allocateDenseBlock();
        ExecutorService pool = null;
        try {
            pool = CommonThreadPool.get(k);
            ArrayList<GrpAggTask> tasks = new ArrayList<GrpAggTask>();
            int blklen = (int)Math.ceil((double)target.clen / (double)k);
            for (int i = 0; i < k && i * blklen < target.clen; ++i) {
                tasks.add(new GrpAggTask(groups, target, weights, result, numGroups, op, i * blklen, Math.min((i + 1) * blklen, target.clen)));
            }
            for (Future<?> task : pool.invokeAll(tasks)) {
                task.get(); // surfaces worker failures
            }
        }
        catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        }
        finally {
            // Shut the pool down on the failure path as well.
            if (pool != null) {
                pool.shutdown();
            }
        }
        result.recomputeNonZeros();
        result.examSparsity();
    }

    /**
     * @param op unary aggregate operator to classify
     * @return {@code true} unless the operator maps to {@code AggType.INVALID}
     */
    public static boolean isSupportedUnaryAggregateOperator(AggregateUnaryOperator op) {
        return LibMatrixAgg.getAggType(op) != AggType.INVALID;
    }

    /**
     * @param op unary (cumulative) operator to classify
     * @return {@code true} unless the operator maps to {@code AggType.INVALID}
     */
    public static boolean isSupportedUnaryOperator(UnaryOperator op) {
        return LibMatrixAgg.getAggType(op) != AggType.INVALID;
    }

    /**
     * Decides whether a unary aggregate should run multi-threaded.
     *
     * <p>All of the following must hold: {@code k > 1}; {@code out} is thread safe; {@code in}
     * has more rows than {@code k/8} (when {@code k} equals the local parallelism) or
     * {@code k/2} (otherwise); the index function is {@code ReduceCol} or
     * {@code out.clen * 8 * k} is below 0x200000; and {@code in} has more non-zeros than 16384
     * (when {@code k} equals the local parallelism) or 0x100000 (otherwise).
     *
     * @param in   input block
     * @param out  output block
     * @param uaop unary aggregate operator
     * @param k    requested degree of parallelism
     * @return {@code true} if the multi-threaded path may be used
     */
    public static boolean satisfiesMultiThreadingConstraints(MatrixBlock in, MatrixBlock out, AggregateUnaryOperator uaop, int k) {
        boolean sharedTP = InfrastructureAnalyzer.getLocalParallelism() == k;
        int minRows = sharedTP ? k / 8 : k / 2;
        long minNnz = sharedTP ? 16384L : 0x100000L;
        return k > 1
            && out.isThreadSafe()
            && in.rlen > minRows
            // Widen to long BEFORE multiplying: an int product that wraps negative would
            // otherwise pass this "<" check and wrongly enable the parallel path.
            && (uaop.indexFn instanceof ReduceCol || (long)out.clen * 8L * (long)k < 0x200000L)
            && in.nonZeros > minNnz;
    }

    /**
     * For max-index / min-index aggregates on a block whose column index is not 1, rewrites
     * every second value of {@code out}'s dense array (positions 0, 2, 4, ...) through
     * {@code UtilFunctions.computeCellIndex}, using the block's column index and {@code bclen}.
     *
     * @param out   aggregate output whose dense values are rewritten in place
     * @param op    operator that determines whether any rewrite is needed
     * @param brlen not used by this method
     * @param bclen passed to {@code UtilFunctions.computeCellIndex}
     * @param ix    block indexes of the aggregated input block
     */
    public static void recomputeIndexes(MatrixBlock out, AggregateUnaryOperator op, int brlen, int bclen, MatrixIndexes ix) {
        AggType type = LibMatrixAgg.getAggType(op);
        boolean indexAggregate = type == AggType.MAX_INDEX || type == AggType.MIN_INDEX;
        if (!indexAggregate || ix.getColumnIndex() == 1L) {
            return;
        }
        final int rows = out.rlen;
        double[] values = out.getDenseBlockValues();
        for (int r = 0; r < rows; ++r) {
            int pos = 2 * r;
            values[pos] = UtilFunctions.computeCellIndex(ix.getColumnIndex(), bclen, (int)values[pos] - 1);
        }
    }

    /**
     * Classifies a unary aggregate operator by its value function, correction location and
     * index function.
     *
     * @param op unary aggregate operator
     * @return the matching {@code AggType}, or {@code AggType.INVALID} if no rule applies
     */
    private static AggType getAggType(AggregateUnaryOperator op) {
        ValueFunction valueFn = op.aggOp.increOp.fn;
        IndexFunction indexFn = op.indexFn;
        PartialAggregate.CorrectionLocationType corr = op.aggOp.correctionLocation;
        boolean allRowOrCol = indexFn instanceof ReduceAll || indexFn instanceof ReduceCol || indexFn instanceof ReduceRow;

        // Kahan sum / sum of squares: single correction row or column; ReduceDiag also allowed.
        boolean singleCorr = corr == PartialAggregate.CorrectionLocationType.LASTCOLUMN || corr == PartialAggregate.CorrectionLocationType.LASTROW;
        if (valueFn instanceof KahanFunction && singleCorr && (allRowOrCol || indexFn instanceof ReduceDiag)) {
            if (valueFn instanceof KahanPlus) {
                return AggType.KAHAN_SUM;
            }
            if (valueFn instanceof KahanPlusSq) {
                return AggType.KAHAN_SUM_SQ;
            }
            // Any other KahanFunction falls through to the remaining rules.
        }

        // Mean: two correction rows or columns.
        boolean doubleCorr = corr == PartialAggregate.CorrectionLocationType.LASTTWOCOLUMNS || corr == PartialAggregate.CorrectionLocationType.LASTTWOROWS;
        if (valueFn instanceof Mean && doubleCorr && allRowOrCol) {
            return AggType.MEAN;
        }

        // Variance: four correction rows or columns.
        boolean quadCorr = corr == PartialAggregate.CorrectionLocationType.LASTFOURCOLUMNS || corr == PartialAggregate.CorrectionLocationType.LASTFOURROWS;
        if (valueFn instanceof CM && ((CM)valueFn).getAggOpType() == CMOperator.AggregateOperationTypes.VARIANCE && quadCorr && allRowOrCol) {
            return AggType.VAR;
        }

        if (valueFn instanceof Multiply && allRowOrCol) {
            return AggType.PROD;
        }

        if (valueFn instanceof Builtin && allRowOrCol) {
            switch (((Builtin)valueFn).bFunc) {
                case MAX:
                    return AggType.MAX;
                case MIN:
                    return AggType.MIN;
                case MAXINDEX:
                    return AggType.MAX_INDEX;
                case MININDEX:
                    return AggType.MIN_INDEX;
                default:
                    break;
            }
        }
        return AggType.INVALID;
    }

    /**
     * Classifies a cumulative unary operator by its builtin function code.
     *
     * @param op unary operator
     * @return the matching cumulative {@code AggType}, or {@code AggType.INVALID} when the
     *         value function is not a {@code Builtin} or its code is not a cumulative one
     */
    private static AggType getAggType(UnaryOperator op) {
        ValueFunction valueFn = op.fn;
        if (!(valueFn instanceof Builtin)) {
            return AggType.INVALID;
        }
        switch (((Builtin)valueFn).bFunc) {
            case CUMSUM:
                return AggType.CUM_KAHAN_SUM;
            case CUMPROD:
                return AggType.CUM_PROD;
            case CUMMIN:
                return AggType.CUM_MIN;
            case CUMMAX:
                return AggType.CUM_MAX;
            case CUMSUMPROD:
                return AggType.CUM_SUM_PROD;
            default:
                return AggType.INVALID;
        }
    }

    /**
     * Merges the partial result {@code partout} into {@code out}.
     *
     * @param aop     aggregate operator that produced the partial results
     * @param out     accumulated result, updated by this call
     * @param partout partial result to merge in
     */
    private static void aggregateFinalResult(AggregateOperator aop, MatrixBlock out, MatrixBlock partout) {
        // Partial means are merged with a Kahan-plus operator that reuses the original
        // correction settings, not with the mean function itself.
        boolean isMean = aop.increOp.fn instanceof Mean;
        AggregateOperator mergeOp = isMean
            ? new AggregateOperator(0.0, KahanPlus.getKahanPlusFnObject(), aop.correctionExists, aop.correctionLocation)
            : aop;

        if (!mergeOp.correctionExists) {
            out.binaryOperationsInPlace(mergeOp.increOp, partout);
            return;
        }
        out.incrementalAggregate(mergeOp, partout);
    }

    /**
     * Dense kernel of the ternary aggregate over rows {@code [rl, ru)}: Kahan-sums the
     * cell-wise product {@code in1 * in2 * in3}, with a factor of 1 when {@code in3} is
     * {@code null}.
     *
     * <p>For {@code ReduceAll} the sum and its correction are written to cells (0,0) and (0,1)
     * of {@code ret}. Otherwise the sum for column {@code j} is kept at index {@code j} of
     * {@code ret}'s dense array and its correction at index {@code j + n}, with {@code n} the
     * column count of {@code in1}.
     */
    private static void aggregateTernaryDense(MatrixBlock in1, MatrixBlock in2, MatrixBlock in3, MatrixBlock ret, IndexFunction ixFn, int rl, int ru) {
        KahanObject acc = new KahanObject(0.0, 0.0);
        KahanPlus plus = KahanPlus.getKahanPlusFnObject();
        double[] first = in1.getDenseBlockValues();
        double[] second = in2.getDenseBlockValues();
        double[] third = in3 != null ? in3.getDenseBlockValues() : null;
        final int n = in1.clen;

        if (ixFn instanceof ReduceAll) {
            // One accumulator for the whole row range; pos walks the dense arrays linearly.
            int pos = rl * n;
            for (int row = rl; row < ru; ++row) {
                for (int col = 0; col < n; ++col, ++pos) {
                    double factor3 = third != null ? third[pos] : 1.0;
                    plus.execute2(acc, first[pos] * second[pos] * factor3);
                }
            }
            ret.quickSetValue(0, 0, acc._sum);
            ret.quickSetValue(0, 1, acc._correction);
            return;
        }

        // Per-column accumulation: load sum/correction, add the product, store them back.
        double[] out = ret.getDenseBlockValues();
        int pos = rl * n;
        for (int row = rl; row < ru; ++row) {
            for (int col = 0; col < n; ++col, ++pos) {
                double factor3 = third != null ? third[pos] : 1.0;
                double product = first[pos] * second[pos] * factor3;
                acc._sum = out[col];
                acc._correction = out[col + n];
                plus.execute2(acc, product);
                out[col] = acc._sum;
                out[col + n] = acc._correction;
            }
        }
    }

    /**
     * Generic kernel of the ternary aggregate over rows {@code [rl, ru)}. The inputs are sorted
     * so that the sparse block with the fewest non-zeros comes first (dense and {@code null}
     * inputs sort last). The first block's sparse rows are iterated and the other two are
     * probed cell by cell.
     *
     * <p>Result layout: for {@code ReduceAll}, the sum and its correction go to cells (0,0) and
     * (0,1) of {@code ret}. Otherwise the sum for column {@code j} is at index {@code j} of
     * {@code ret}'s dense array and its correction at {@code j + n}, with {@code n} the column
     * count of {@code in1}.
     */
    private static void aggregateTernaryGeneric(MatrixBlock in1, MatrixBlock in2, MatrixBlock in3, MatrixBlock ret, IndexFunction ixFn, int rl, int ru) {
        KahanObject acc = new KahanObject(0.0, 0.0);
        KahanPlus plus = KahanPlus.getKahanPlusFnObject();

        MatrixBlock[] ordered = new MatrixBlock[]{in1, in2, in3};
        Arrays.sort(ordered, new Comparator<MatrixBlock>(){

            @Override
            public int compare(MatrixBlock left, MatrixBlock right) {
                return Long.compare(this.sortKey(left), this.sortKey(right));
            }

            // Sparse blocks sort by their non-zero count; everything else sorts last.
            private long sortKey(MatrixBlock mb) {
                return mb != null && mb.sparse ? mb.nonZeros : Long.MAX_VALUE;
            }
        });
        MatrixBlock driver = ordered[0];
        MatrixBlock probe1 = ordered[1];
        MatrixBlock probe2 = ordered[2];

        SparseBlock rows = driver.sparseBlock;
        final int n = in1.clen;
        final boolean reduceAll = ixFn instanceof ReduceAll;
        // The per-column output array is only needed when not reducing to a single cell.
        double[] out = reduceAll ? null : ret.getDenseBlockValues();

        for (int row = rl; row < ru; ++row) {
            if (!rows.isEmpty(row)) {
                int start = rows.pos(row);
                int count = rows.size(row);
                int[] cols = rows.indexes(row);
                double[] vals = rows.values(row);
                int end = start + count;
                for (int p = start; p < end; ++p) {
                    int colIx = cols[p];
                    double product = vals[p] * probe1.quickGetValue(row, colIx);
                    // The third factor is only looked up when the product is still non-zero.
                    if (product != 0.0 && probe2 != null) {
                        product *= probe2.quickGetValue(row, colIx);
                    }
                    if (reduceAll) {
                        plus.execute2(acc, product);
                    } else {
                        acc._sum = out[colIx];
                        acc._correction = out[colIx + n];
                        plus.execute2(acc, product);
                        out[colIx] = acc._sum;
                        out[colIx + n] = acc._correction;
                    }
                }
            }
        }

        if (reduceAll) {
            ret.quickSetValue(0, 0, acc._sum);
            ret.quickSetValue(0, 1, acc._correction);
        }
    }

    /**
     * Grouped aggregate for columns {@code [cl, cu)} of {@code target}, using the operator's
     * increment function on one {@code KahanObject} per (group, column).
     *
     * <p>Group ids are read from column 0 of {@code groups} and are 1-based; ids greater than
     * {@code numGroups} are skipped. Zero cells of {@code target} are skipped. When
     * {@code weights} is given, each value is multiplied by the weight of its group entry.
     *
     * <p>If {@code target} is a row vector (one row, several columns), its columns are the
     * observations: {@code groups} and {@code weights} are indexed by column position and only
     * buffer column 0 is used, regardless of {@code cl}/{@code cu}.
     *
     * @param groups    group ids
     * @param target    values to aggregate; an empty block leaves {@code result} untouched
     * @param weights   optional weights; may be {@code null}
     * @param result    output; cell (g, j) receives the sum for group {@code g+1} and target
     *                  column {@code j}
     * @param numGroups number of groups
     * @param aggop     aggregate operator supplying the initial value and increment function
     * @param cl        first target column (inclusive)
     * @param cu        last target column (exclusive)
     */
    private static void groupedAggregateKahanPlus(MatrixBlock groups, MatrixBlock target, MatrixBlock weights, MatrixBlock result, int numGroups, AggregateOperator aggop, int cl, int cu) {
        boolean rowVector = target.getNumRows() == 1 && target.getNumColumns() > 1;
        double w = 1.0;
        if (target.isEmptyBlock(false)) {
            return;
        }
        int numCols2 = cu - cl;
        KahanObject[][] buffer = new KahanObject[numGroups][numCols2];
        for (int i = 0; i < numGroups; ++i) {
            for (int j = 0; j < numCols2; ++j) {
                buffer[i][j] = new KahanObject(aggop.initialValue, 0.0);
            }
        }
        if (rowVector) {
            if (target.sparse) {
                if (!target.sparseBlock.isEmpty(0)) {
                    int pos = target.sparseBlock.pos(0);
                    int len = target.sparseBlock.size(0);
                    int[] aix = target.sparseBlock.indexes(0);
                    double[] avals = target.sparseBlock.values(0);
                    for (int j = pos; j < pos + len; ++j) {
                        // Row vector: the column position identifies the observation.
                        int g = (int)groups.quickGetValue(aix[j], 0);
                        if (g > numGroups) continue;
                        if (weights != null) {
                            w = weights.quickGetValue(aix[j], 0);
                        }
                        aggop.increOp.fn.execute((Data)buffer[g - 1][0], avals[j] * w);
                    }
                }
            } else {
                double[] a = target.getDenseBlockValues();
                for (int i = 0; i < target.getNumColumns(); ++i) {
                    int g;
                    double d = a[i];
                    if (d == 0.0 || (g = (int)groups.quickGetValue(i, 0)) > numGroups) continue;
                    if (weights != null) {
                        w = weights.quickGetValue(i, 0);
                    }
                    aggop.increOp.fn.execute((Data)buffer[g - 1][0], d * w);
                }
            }
        } else if (target.sparse) {
            SparseBlock a = target.sparseBlock;
            for (int i = 0; i < groups.getNumRows(); ++i) {
                int g = (int)groups.quickGetValue(i, 0);
                if (g > numGroups || a.isEmpty(i)) continue;
                int pos = a.pos(i);
                int len = a.size(i);
                int[] aix = a.indexes(i);
                double[] avals = a.values(i);
                // Start at the first non-zero with column >= cl; a negative lookup result means
                // there is none, so start past the row's end.
                int j = cl == 0 ? 0 : a.posFIndexGTE(i, cl);
                j = j >= 0 ? pos + j : pos + len;
                // The weight belongs to the row, like the group id, so it is looked up by row
                // index i (as in the dense branch below), not by the column index of the cell.
                if (weights != null) {
                    w = weights.quickGetValue(i, 0);
                }
                for (; j < pos + len && aix[j] < cu; ++j) {
                    aggop.increOp.fn.execute((Data)buffer[g - 1][aix[j] - cl], avals[j] * w);
                }
            }
        } else {
            DenseBlock a = target.getDenseBlock();
            for (int i = 0; i < groups.getNumRows(); ++i) {
                int g = (int)groups.quickGetValue(i, 0);
                if (g > numGroups) continue;
                double[] avals = a.values(i);
                int aix = a.pos(i);
                for (int j = cl; j < cu; ++j) {
                    double d = avals[aix + j];
                    if (d == 0.0) continue;
                    if (weights != null) {
                        w = weights.quickGetValue(i, 0);
                    }
                    aggop.increOp.fn.execute((Data)buffer[g - 1][j - cl], d * w);
                }
            }
        }
        // The buffer is local to [cl, cu); the result uses absolute column indexes.
        for (int i = 0; i < numGroups; ++i) {
            for (int j = 0; j < numCols2; ++j) {
                result.appendValue(i, j + cl, buffer[i][j]._sum);
            }
        }
    }

    /**
     * Grouped central-moment aggregate for columns {@code [cl, cu)} of {@code target}, keeping
     * one {@code CM_COV_Object} per (group, column).
     *
     * <p>Group ids are read from column 0 of {@code groups} and are 1-based; ids greater than
     * {@code numGroups} are skipped. Zero cells are fed to the aggregate as well, including
     * those of empty sparse rows.
     *
     * @param groups    group ids
     * @param target    values to aggregate
     * @param weights   optional per-row weights; may be {@code null}
     * @param result    output; cell (g, j) receives the statistic for group {@code g+1} and
     *                  target column {@code j}
     * @param numGroups number of groups
     * @param cmOp      central-moment operator that selects the statistic
     * @param cl        first target column (inclusive)
     * @param cu        last target column (exclusive)
     */
    private static void groupedAggregateCM(MatrixBlock groups, MatrixBlock target, MatrixBlock weights, MatrixBlock result, int numGroups, CMOperator cmOp, int cl, int cu) {
        CM cmFn = CM.getCMFnObject(cmOp.getAggOpType());
        double w = 1.0;
        int numCols2 = cu - cl;
        CM_COV_Object[][] cmValues = new CM_COV_Object[numGroups][numCols2];
        for (int i = 0; i < numGroups; ++i) {
            for (int j = 0; j < numCols2; ++j) {
                cmValues[i][j] = new CM_COV_Object();
            }
        }
        if (target.sparse) {
            SparseBlock a = target.sparseBlock;
            SpoofOperator.SideInputSparseCell sa = new SpoofOperator.SideInputSparseCell(new SpoofOperator.SideInput(null, target, target.clen));
            for (int i = 0; i < groups.getNumRows(); ++i) {
                int g = (int)groups.quickGetValue(i, 0);
                if (g > numGroups) continue;
                if (a.isEmpty(i)) {
                    // An empty row still contributes a zero to every column in [cl, cu).
                    w = weights != null ? weights.quickGetValue(i, 0) : w;
                    for (int j = cl; j < cu; ++j) {
                        cmFn.execute(cmValues[g - 1][j - cl], 0.0, w);
                    }
                    continue;
                }
                for (int j = cl; j < cu; ++j) {
                    double d = sa.getValue(i, j);
                    if (weights != null) {
                        w = weights.quickGetValue(i, 0);
                    }
                    cmFn.execute(cmValues[g - 1][j - cl], d, w);
                }
            }
        } else {
            DenseBlock a = target.getDenseBlock();
            for (int i = 0; i < groups.getNumRows(); ++i) {
                int g = (int)groups.quickGetValue(i, 0);
                if (g > numGroups) continue;
                double[] avals = a.values(i);
                int aix = a.pos(i);
                for (int j = cl; j < cu; ++j) {
                    double d = avals[aix + j];
                    if (weights != null) {
                        w = weights.quickGetValue(i, 0);
                    }
                    cmFn.execute(cmValues[g - 1][j - cl], d, w);
                }
            }
        }
        // cmValues is local to [cl, cu), so it is read at j; the result uses absolute column
        // indexes, so it is written at j + cl. Reading cmValues at j + cl would run past the
        // array for any cl > 0.
        for (int i = 0; i < numGroups; ++i) {
            for (int j = 0; j < numCols2; ++j) {
                result.appendValue(i, j + cl, cmValues[i][j].getRequiredResult(cmOp));
            }
        }
    }

    /**
     * Counts how many entries of the dense group vector fall into each group
     * and appends the counts to column 0 of {@code result}.
     *
     * <p>Group ids are used 1-based (id {@code g} increments counter
     * {@code g - 1}); ids greater than {@code numGroups} are ignored.
     *
     * @throws DMLRuntimeException if {@code groups} is in sparse format or is
     *                             an empty block
     */
    private static void groupedAggregateVecCount(MatrixBlock groups, MatrixBlock result, int numGroups) {
        boolean unsupported = groups.isInSparseFormat() || groups.isEmptyBlock(false);
        if (unsupported) {
            throw new DMLRuntimeException("Unsupported sparse input for aggregate-count on group vector.");
        }

        double[] groupIds = groups.getDenseBlockValues();
        int[] counts = new int[numGroups];
        int numRows = groups.rlen;

        int row = 0;
        while (row < numRows) {
            int g = (int)groupIds[row];
            if (g <= numGroups) {
                ++counts[g - 1];
            }
            ++row;
        }

        int grp = 0;
        while (grp < numGroups) {
            result.appendValue(grp, 0, counts[grp]);
            ++grp;
        }
    }

    /**
     * Adds every cell of the dense input into the dense aggregate using
     * {@code KahanPlus.execute2}, with {@code aggCorr} holding the per-cell
     * correction term. The non-zero counts of both outputs are recomputed
     * while iterating.
     */
    private static void aggregateBinaryMatrixAllDense(MatrixBlock in, MatrixBlock aggVal, MatrixBlock aggCorr) {
        aggVal.allocateDenseBlock();
        aggCorr.allocateDenseBlock();

        double[] inVals = in.getDenseBlockValues();
        double[] sums = aggVal.getDenseBlockValues();
        double[] corrs = aggCorr.getDenseBlockValues();
        KahanObject kbuff = new KahanObject(0.0, 0.0);
        KahanPlus kplus = KahanPlus.getKahanPlusFnObject();

        // never read past the logical size of the input block
        int limit = Math.min(inVals.length, in.rlen * in.clen);
        int sumNnz = 0;
        int corrNnz = 0;
        int pos = 0;
        while (pos < limit) {
            kbuff._sum = sums[pos];
            kbuff._correction = corrs[pos];
            kplus.execute2(kbuff, inVals[pos]);
            sums[pos] = kbuff._sum;
            corrs[pos] = kbuff._correction;
            if (kbuff._sum != 0.0) {
                ++sumNnz;
            }
            if (kbuff._correction != 0.0) {
                ++corrNnz;
            }
            ++pos;
        }
        aggVal.nonZeros = sumNnz;
        aggCorr.nonZeros = corrNnz;
    }

    /**
     * Adds the non-zero cells of a sparse input into dense aggregate and
     * correction blocks using {@code KahanPlus.execute2}, then recomputes the
     * non-zero counts of both outputs.
     */
    private static void aggregateBinaryMatrixSparseDense(MatrixBlock in, MatrixBlock aggVal, MatrixBlock aggCorr) {
        aggVal.allocateDenseBlock();
        aggCorr.allocateDenseBlock();

        SparseBlock sblock = in.getSparseBlock();
        double[] sums = aggVal.getDenseBlockValues();
        double[] corrs = aggCorr.getDenseBlockValues();
        KahanObject kbuff = new KahanObject(0.0, 0.0);
        KahanPlus kplus = KahanPlus.getKahanPlusFnObject();

        int numCols = in.clen;
        int numRows = Math.min(sblock.numRows(), in.rlen);

        // rowOff is the offset of row r inside the dense output arrays
        for (int r = 0, rowOff = 0; r < numRows; ++r, rowOff += numCols) {
            if (sblock.isEmpty(r)) {
                continue;
            }
            int start = sblock.pos(r);
            int end = start + sblock.size(r);
            int[] colIdx = sblock.indexes(r);
            double[] vals = sblock.values(r);
            for (int k = start; k < end; ++k) {
                int target = rowOff + colIdx[k];
                kbuff._sum = sums[target];
                kbuff._correction = corrs[target];
                kplus.execute2(kbuff, vals[k]);
                sums[target] = kbuff._sum;
                corrs[target] = kbuff._correction;
            }
        }

        aggVal.recomputeNonZeros();
        aggCorr.recomputeNonZeros();
    }

    /**
     * Adds the non-zero cells of a sparse input into aggregate and correction
     * blocks of arbitrary format via {@code quickGetValue}/{@code quickSetValue},
     * using {@code KahanPlus.execute2}. Sparse outputs are re-examined for
     * sparsity afterwards.
     */
    private static void aggregateBinaryMatrixSparseGeneric(MatrixBlock in, MatrixBlock aggVal, MatrixBlock aggCorr) {
        SparseBlock sblock = in.getSparseBlock();
        KahanObject kbuff = new KahanObject(0.0, 0.0);
        KahanPlus kplus = KahanPlus.getKahanPlusFnObject();
        int numRows = Math.min(sblock.numRows(), in.rlen);

        int r = 0;
        while (r < numRows) {
            if (!sblock.isEmpty(r)) {
                int start = sblock.pos(r);
                int end = start + sblock.size(r);
                int[] colIdx = sblock.indexes(r);
                double[] vals = sblock.values(r);
                for (int k = start; k < end; ++k) {
                    int col = colIdx[k];
                    kbuff._sum = aggVal.quickGetValue(r, col);
                    kbuff._correction = aggCorr.quickGetValue(r, col);
                    kplus.execute2(kbuff, vals[k]);
                    aggVal.quickSetValue(r, col, kbuff._sum);
                    aggCorr.quickSetValue(r, col, kbuff._correction);
                }
            }
            ++r;
        }

        if (aggVal.sparse) {
            aggVal.examSparsity(false);
        }
        if (aggCorr.sparse) {
            aggCorr.examSparsity(false);
        }
    }

    /**
     * Adds every cell of a dense input into aggregate and correction blocks of
     * arbitrary format via {@code quickGetValue}/{@code quickSetValue}, using
     * {@code KahanPlus.execute}. Sparse outputs are re-examined for sparsity
     * afterwards.
     */
    private static void aggregateBinaryMatrixDenseGeneric(MatrixBlock in, MatrixBlock aggVal, MatrixBlock aggCorr) {
        int numRows = in.rlen;
        int numCols = in.clen;
        double[] inVals = in.getDenseBlockValues();
        KahanObject kbuff = new KahanObject(0.0, 0.0);
        KahanPlus kplus = KahanPlus.getKahanPlusFnObject();

        // pos walks the input array linearly in row-major order
        int pos = 0;
        for (int r = 0; r < numRows; ++r) {
            for (int col = 0; col < numCols; ++col, ++pos) {
                kbuff._sum = aggVal.quickGetValue(r, col);
                kbuff._correction = aggCorr.quickGetValue(r, col);
                kplus.execute((Data)kbuff, inVals[pos]);
                aggVal.quickSetValue(r, col, kbuff._sum);
                aggCorr.quickSetValue(r, col, kbuff._correction);
            }
        }

        if (aggVal.sparse) {
            aggVal.examSparsity(false);
        }
        if (aggCorr.sparse) {
            aggCorr.examSparsity(false);
        }
    }

    /**
     * Aggregates a dense input whose last row carries the correction terms
     * into {@code aggVal}, which uses the same layout (values in rows
     * {@code 0..m-2}, corrections in row {@code m-1}). Returns immediately if
     * the input has no dense block or is empty.
     */
    private static void aggregateBinaryMatrixLastRowDenseGeneric(MatrixBlock in, MatrixBlock aggVal) {
        if (in.denseBlock == null || in.isEmptyBlock(false)) {
            return;
        }

        int numRows = in.rlen;
        int numCols = in.clen;
        int corrRow = numRows - 1;
        // offset of the correction row inside the linearized input
        int corrOff = corrRow * numCols;
        double[] inVals = in.getDenseBlockValues();
        KahanObject kbuff = new KahanObject(0.0, 0.0);
        KahanPlus kplus = KahanPlus.getKahanPlusFnObject();

        int pos = 0;
        for (int r = 0; r < corrRow; ++r) {
            for (int col = 0; col < numCols; ++col, ++pos) {
                kbuff._sum = aggVal.quickGetValue(r, col);
                kbuff._correction = aggVal.quickGetValue(corrRow, col);
                kplus.execute(kbuff, inVals[pos], inVals[corrOff + col]);
                aggVal.quickSetValue(r, col, kbuff._sum);
                aggVal.quickSetValue(corrRow, col, kbuff._correction);
            }
        }
        aggVal.examSparsity();
    }

    /**
     * Aggregates a sparse input whose last row ({@code m-1}) carries the
     * correction terms into {@code aggVal}, which uses the same layout.
     * Only non-zero value cells of the input are visited. Returns immediately
     * if the input is empty.
     */
    private static void aggregateBinaryMatrixLastRowSparseGeneric(MatrixBlock in, MatrixBlock aggVal) {
        if (in.isEmptyBlock(false)) {
            return;
        }

        SparseBlock sblock = in.getSparseBlock();
        KahanObject kbuff = new KahanObject(0.0, 0.0);
        KahanPlus kplus = KahanPlus.getKahanPlusFnObject();
        int corrRow = in.rlen - 1;
        int lastValueRow = Math.min(sblock.numRows(), in.rlen) - 1;

        int r = 0;
        while (r < lastValueRow) {
            if (!sblock.isEmpty(r)) {
                int start = sblock.pos(r);
                int end = start + sblock.size(r);
                int[] colIdx = sblock.indexes(r);
                double[] vals = sblock.values(r);
                for (int k = start; k < end; ++k) {
                    int col = colIdx[k];
                    double inCorr = in.quickGetValue(corrRow, col);
                    kbuff._sum = aggVal.quickGetValue(r, col);
                    kbuff._correction = aggVal.quickGetValue(corrRow, col);
                    kplus.execute(kbuff, vals[k], inCorr);
                    aggVal.quickSetValue(r, col, kbuff._sum);
                    aggVal.quickSetValue(corrRow, col, kbuff._correction);
                }
            }
            ++r;
        }
        aggVal.examSparsity();
    }

    /**
     * Aggregates a dense input whose last column ({@code n-1}) carries the
     * correction terms into {@code aggVal}, which uses the same layout (values
     * in columns {@code 0..n-2}, correction in column {@code n-1}). Returns
     * immediately if the input has no dense block or is empty.
     *
     * <p>The input correction is always taken from the last column of the
     * current row, matching the sparse last-column variant. Reading it from
     * column {@code j + 1} is only equivalent for two-column inputs.
     *
     * @param in     dense input block with the correction in its last column
     * @param aggVal aggregate block updated in place
     */
    private static void aggregateBinaryMatrixLastColDenseGeneric(MatrixBlock in, MatrixBlock aggVal) {
        if (in.denseBlock == null || in.isEmptyBlock(false)) {
            return;
        }
        int m = in.rlen;
        int n = in.clen;
        double[] a = in.getDenseBlockValues();
        KahanObject buffer = new KahanObject(0.0, 0.0);
        KahanPlus akplus = KahanPlus.getKahanPlusFnObject();
        // ix is the offset of row i inside the linearized input
        for (int i = 0, ix = 0; i < m; ++i, ix += n) {
            for (int j = 0; j < n - 1; ++j) {
                buffer._sum = aggVal.quickGetValue(i, j);
                buffer._correction = aggVal.quickGetValue(i, n - 1);
                // correction of the input row lives in its last column
                akplus.execute(buffer, a[ix + j], a[ix + n - 1]);
                aggVal.quickSetValue(i, j, buffer._sum);
                aggVal.quickSetValue(i, n - 1, buffer._correction);
            }
        }
        aggVal.examSparsity();
    }

    /**
     * Aggregates a sparse input whose last column ({@code n-1}) carries the
     * correction terms into {@code aggVal}, which uses the same layout.
     * For each non-empty row, the non-zero cells are visited in stored order
     * until a column index of {@code n-1} or beyond is reached. Returns
     * immediately if the input is empty.
     */
    private static void aggregateBinaryMatrixLastColSparseGeneric(MatrixBlock in, MatrixBlock aggVal) {
        if (in.isEmptyBlock(false)) {
            return;
        }

        SparseBlock sblock = in.getSparseBlock();
        KahanObject kbuff = new KahanObject(0.0, 0.0);
        KahanPlus kplus = KahanPlus.getKahanPlusFnObject();
        int corrCol = in.clen - 1;
        int numRows = Math.min(sblock.numRows(), in.rlen);

        int r = 0;
        while (r < numRows) {
            if (!sblock.isEmpty(r)) {
                int start = sblock.pos(r);
                int end = start + sblock.size(r);
                int[] colIdx = sblock.indexes(r);
                double[] vals = sblock.values(r);
                for (int k = start; k < end; ++k) {
                    int col = colIdx[k];
                    if (col >= corrCol) {
                        // reached the correction column: no more value cells in this row
                        break;
                    }
                    double inCorr = in.quickGetValue(r, corrCol);
                    kbuff._sum = aggVal.quickGetValue(r, col);
                    kbuff._correction = aggVal.quickGetValue(r, corrCol);
                    kplus.execute(kbuff, vals[k], inCorr);
                    aggVal.quickSetValue(r, col, kbuff._sum);
                    aggVal.quickSetValue(r, corrCol, kbuff._correction);
                }
            }
            ++r;
        }
        aggVal.examSparsity();
    }

    /**
     * Dispatches a unary aggregate over rows {@code [rl, ru)} of a dense input
     * to the matching {@code d_*} kernel, selected by aggregation type and
     * index function. Combinations of a known type with an index function that
     * has no kernel are silently ignored.
     *
     * @throws DMLRuntimeException if {@code optype} has no case here
     */
    private static void aggregateUnaryMatrixDense(MatrixBlock in, MatrixBlock out, AggType optype, ValueFunction vFn, IndexFunction ixFn, int rl, int ru) {
        final int n = in.clen;
        final DenseBlock a = in.getDenseBlock();
        final DenseBlock c = out.getDenseBlock();
        switch (optype) {
            case KAHAN_SUM: {
                KahanObject kbuff = new KahanObject(0.0, 0.0);
                if (ixFn instanceof ReduceAll) {
                    LibMatrixAgg.d_uakp(a, c, n, kbuff, (KahanPlus)vFn, rl, ru);
                } else if (ixFn instanceof ReduceCol) {
                    LibMatrixAgg.d_uarkp(a, c, n, kbuff, (KahanPlus)vFn, rl, ru);
                } else if (ixFn instanceof ReduceRow) {
                    LibMatrixAgg.d_uackp(a, c, n, kbuff, (KahanPlus)vFn, rl, ru);
                } else if (ixFn instanceof ReduceDiag) {
                    LibMatrixAgg.d_uakptrace(a, c, n, kbuff, (KahanPlus)vFn, rl, ru);
                }
                break;
            }
            case KAHAN_SUM_SQ: {
                KahanObject kbuff = new KahanObject(0.0, 0.0);
                if (ixFn instanceof ReduceAll) {
                    LibMatrixAgg.d_uasqkp(a, c, n, kbuff, (KahanPlusSq)vFn, rl, ru);
                } else if (ixFn instanceof ReduceCol) {
                    LibMatrixAgg.d_uarsqkp(a, c, n, kbuff, (KahanPlusSq)vFn, rl, ru);
                } else if (ixFn instanceof ReduceRow) {
                    LibMatrixAgg.d_uacsqkp(a, c, n, kbuff, (KahanPlusSq)vFn, rl, ru);
                }
                break;
            }
            case CUM_KAHAN_SUM: {
                KahanObject kbuff = new KahanObject(0.0, 0.0);
                KahanPlus kplus = KahanPlus.getKahanPlusFnObject();
                LibMatrixAgg.d_ucumkp(in.getDenseBlock(), null, out.getDenseBlock(), n, kbuff, kplus, rl, ru);
                break;
            }
            case CUM_PROD: {
                LibMatrixAgg.d_ucumm(in.getDenseBlockValues(), null, out.getDenseBlockValues(), n, rl, ru);
                break;
            }
            case CUM_MIN:
            case CUM_MAX: {
                double init;
                if (optype == AggType.CUM_MAX) {
                    init = Double.NEGATIVE_INFINITY;
                } else {
                    init = Double.POSITIVE_INFINITY;
                }
                LibMatrixAgg.d_ucummxx(in.getDenseBlockValues(), null, out.getDenseBlockValues(), n, init, (Builtin)vFn, rl, ru);
                break;
            }
            case MIN:
            case MAX: {
                double init;
                if (optype == AggType.MAX) {
                    init = Double.NEGATIVE_INFINITY;
                } else {
                    init = Double.POSITIVE_INFINITY;
                }
                if (ixFn instanceof ReduceAll) {
                    LibMatrixAgg.d_uamxx(a, c, n, init, (Builtin)vFn, rl, ru);
                } else if (ixFn instanceof ReduceCol) {
                    LibMatrixAgg.d_uarmxx(a, c, n, init, (Builtin)vFn, rl, ru);
                } else if (ixFn instanceof ReduceRow) {
                    LibMatrixAgg.d_uacmxx(a, c, n, init, (Builtin)vFn, rl, ru);
                }
                break;
            }
            case MAX_INDEX: {
                if (ixFn instanceof ReduceCol) {
                    LibMatrixAgg.d_uarimax(a, c, n, Double.NEGATIVE_INFINITY, (Builtin)vFn, rl, ru);
                }
                break;
            }
            case MIN_INDEX: {
                if (ixFn instanceof ReduceCol) {
                    LibMatrixAgg.d_uarimin(a, c, n, Double.POSITIVE_INFINITY, (Builtin)vFn, rl, ru);
                }
                break;
            }
            case MEAN: {
                KahanObject kbuff = new KahanObject(0.0, 0.0);
                if (ixFn instanceof ReduceAll) {
                    LibMatrixAgg.d_uamean(a, c, n, kbuff, (Mean)vFn, rl, ru);
                } else if (ixFn instanceof ReduceCol) {
                    LibMatrixAgg.d_uarmean(a, c, n, kbuff, (Mean)vFn, rl, ru);
                } else if (ixFn instanceof ReduceRow) {
                    LibMatrixAgg.d_uacmean(a, c, n, kbuff, (Mean)vFn, rl, ru);
                }
                break;
            }
            case VAR: {
                CM_COV_Object cbuff = new CM_COV_Object();
                if (ixFn instanceof ReduceAll) {
                    LibMatrixAgg.d_uavar(a, c, n, cbuff, (CM)vFn, rl, ru);
                } else if (ixFn instanceof ReduceCol) {
                    LibMatrixAgg.d_uarvar(a, c, n, cbuff, (CM)vFn, rl, ru);
                } else if (ixFn instanceof ReduceRow) {
                    LibMatrixAgg.d_uacvar(a, c, n, cbuff, (CM)vFn, rl, ru);
                }
                break;
            }
            case PROD: {
                if (ixFn instanceof ReduceAll) {
                    LibMatrixAgg.d_uam(a, c, n, rl, ru);
                } else if (ixFn instanceof ReduceCol) {
                    LibMatrixAgg.d_uarm(a, c, n, rl, ru);
                } else if (ixFn instanceof ReduceRow) {
                    LibMatrixAgg.d_uacm(a, c, n, rl, ru);
                }
                break;
            }
            default: {
                throw new DMLRuntimeException("Unsupported aggregation type: " + optype);
            }
        }
    }

    /**
     * Dispatches a unary aggregate over rows {@code [rl, ru)} of a sparse input
     * to the matching {@code s_*} kernel, selected by aggregation type and
     * index function. Combinations of a known type with an index function that
     * has no kernel are silently ignored.
     *
     * @throws DMLRuntimeException if {@code optype} has no case here
     */
    private static void aggregateUnaryMatrixSparse(MatrixBlock in, MatrixBlock out, AggType optype, ValueFunction vFn, IndexFunction ixFn, int rl, int ru) {
        final int m = in.rlen;
        final int n = in.clen;
        final SparseBlock a = in.getSparseBlock();
        final DenseBlock c = out.getDenseBlock();
        switch (optype) {
            case KAHAN_SUM: {
                KahanObject kbuff = new KahanObject(0.0, 0.0);
                if (ixFn instanceof ReduceAll) {
                    LibMatrixAgg.s_uakp(a, c, n, kbuff, (KahanPlus)vFn, rl, ru);
                } else if (ixFn instanceof ReduceCol) {
                    LibMatrixAgg.s_uarkp(a, c, n, kbuff, (KahanPlus)vFn, rl, ru);
                } else if (ixFn instanceof ReduceRow) {
                    LibMatrixAgg.s_uackp(a, c, n, kbuff, (KahanPlus)vFn, rl, ru);
                } else if (ixFn instanceof ReduceDiag) {
                    LibMatrixAgg.s_uakptrace(a, c, n, kbuff, (KahanPlus)vFn, rl, ru);
                }
                break;
            }
            case KAHAN_SUM_SQ: {
                KahanObject kbuff = new KahanObject(0.0, 0.0);
                if (ixFn instanceof ReduceAll) {
                    LibMatrixAgg.s_uasqkp(a, c, n, kbuff, (KahanPlusSq)vFn, rl, ru);
                } else if (ixFn instanceof ReduceCol) {
                    LibMatrixAgg.s_uarsqkp(a, c, n, kbuff, (KahanPlusSq)vFn, rl, ru);
                } else if (ixFn instanceof ReduceRow) {
                    LibMatrixAgg.s_uacsqkp(a, c, n, kbuff, (KahanPlusSq)vFn, rl, ru);
                }
                break;
            }
            case CUM_KAHAN_SUM: {
                KahanObject kbuff = new KahanObject(0.0, 0.0);
                KahanPlus kplus = KahanPlus.getKahanPlusFnObject();
                LibMatrixAgg.s_ucumkp(a, null, out.getDenseBlock(), m, n, kbuff, kplus, rl, ru);
                break;
            }
            case CUM_PROD: {
                LibMatrixAgg.s_ucumm(a, null, out.getDenseBlockValues(), n, rl, ru);
                break;
            }
            case CUM_MIN:
            case CUM_MAX: {
                double init;
                if (optype == AggType.CUM_MAX) {
                    init = Double.NEGATIVE_INFINITY;
                } else {
                    init = Double.POSITIVE_INFINITY;
                }
                LibMatrixAgg.s_ucummxx(a, null, out.getDenseBlockValues(), n, init, (Builtin)vFn, rl, ru);
                break;
            }
            case MIN:
            case MAX: {
                double init;
                if (optype == AggType.MAX) {
                    init = Double.NEGATIVE_INFINITY;
                } else {
                    init = Double.POSITIVE_INFINITY;
                }
                if (ixFn instanceof ReduceAll) {
                    LibMatrixAgg.s_uamxx(a, c, n, init, (Builtin)vFn, rl, ru);
                } else if (ixFn instanceof ReduceCol) {
                    LibMatrixAgg.s_uarmxx(a, c, n, init, (Builtin)vFn, rl, ru);
                } else if (ixFn instanceof ReduceRow) {
                    LibMatrixAgg.s_uacmxx(a, c, m, n, init, (Builtin)vFn, rl, ru);
                }
                break;
            }
            case MAX_INDEX: {
                if (ixFn instanceof ReduceCol) {
                    LibMatrixAgg.s_uarimax(a, c, n, Double.NEGATIVE_INFINITY, (Builtin)vFn, rl, ru);
                }
                break;
            }
            case MIN_INDEX: {
                if (ixFn instanceof ReduceCol) {
                    LibMatrixAgg.s_uarimin(a, c, n, Double.POSITIVE_INFINITY, (Builtin)vFn, rl, ru);
                }
                break;
            }
            case MEAN: {
                KahanObject kbuff = new KahanObject(0.0, 0.0);
                if (ixFn instanceof ReduceAll) {
                    LibMatrixAgg.s_uamean(a, c, n, kbuff, (Mean)vFn, rl, ru);
                } else if (ixFn instanceof ReduceCol) {
                    LibMatrixAgg.s_uarmean(a, c, n, kbuff, (Mean)vFn, rl, ru);
                } else if (ixFn instanceof ReduceRow) {
                    LibMatrixAgg.s_uacmean(a, c, n, kbuff, (Mean)vFn, rl, ru);
                }
                break;
            }
            case VAR: {
                CM_COV_Object cbuff = new CM_COV_Object();
                if (ixFn instanceof ReduceAll) {
                    LibMatrixAgg.s_uavar(a, c, n, cbuff, (CM)vFn, rl, ru);
                } else if (ixFn instanceof ReduceCol) {
                    LibMatrixAgg.s_uarvar(a, c, n, cbuff, (CM)vFn, rl, ru);
                } else if (ixFn instanceof ReduceRow) {
                    LibMatrixAgg.s_uacvar(a, c, n, cbuff, (CM)vFn, rl, ru);
                }
                break;
            }
            case PROD: {
                if (ixFn instanceof ReduceAll) {
                    LibMatrixAgg.s_uam(a, c, n, rl, ru);
                } else if (ixFn instanceof ReduceCol) {
                    LibMatrixAgg.s_uarm(a, c, n, rl, ru);
                } else if (ixFn instanceof ReduceRow) {
                    LibMatrixAgg.s_uacm(a, c, n, rl, ru);
                }
                break;
            }
            default: {
                throw new DMLRuntimeException("Unsupported aggregation type: " + optype);
            }
        }
    }

    /**
     * Dispatches a cumulative aggregate over rows {@code [rl, ru)} of a dense
     * input to the matching {@code d_ucum*} kernel. {@code agg} is forwarded to
     * the kernel as its initial aggregate and may be {@code null}.
     *
     * @throws DMLRuntimeException for an unsupported aggregation type, or for
     *                             {@code CUM_SUM_PROD} on an input that does
     *                             not have exactly two columns
     */
    private static void cumaggregateUnaryMatrixDense(MatrixBlock in, MatrixBlock out, AggType optype, ValueFunction vFn, double[] agg, int rl, int ru) {
        final int numCols = in.clen;
        final DenseBlock inBlock = in.getDenseBlock();
        final DenseBlock outBlock = out.getDenseBlock();
        final double[] inVals = in.getDenseBlockValues();
        final double[] outVals = out.getDenseBlockValues();
        switch (optype) {
            case CUM_KAHAN_SUM: {
                KahanObject kbuff = new KahanObject(0.0, 0.0);
                KahanPlus kplus = KahanPlus.getKahanPlusFnObject();
                LibMatrixAgg.d_ucumkp(inBlock, agg, outBlock, numCols, kbuff, kplus, rl, ru);
                break;
            }
            case CUM_SUM_PROD: {
                if (numCols != 2) {
                    throw new DMLRuntimeException("Cumsumprod expects two-column input (n=" + numCols + ").");
                }
                LibMatrixAgg.d_ucumkpp(inBlock, agg, outBlock, rl, ru);
                break;
            }
            case CUM_PROD: {
                LibMatrixAgg.d_ucumm(inVals, agg, outVals, numCols, rl, ru);
                break;
            }
            case CUM_MIN:
            case CUM_MAX: {
                double init;
                if (optype == AggType.CUM_MAX) {
                    init = Double.NEGATIVE_INFINITY;
                } else {
                    init = Double.POSITIVE_INFINITY;
                }
                LibMatrixAgg.d_ucummxx(inVals, agg, outVals, numCols, init, (Builtin)vFn, rl, ru);
                break;
            }
            default: {
                throw new DMLRuntimeException("Unsupported cumulative aggregation type: " + optype);
            }
        }
    }

    /**
     * Dispatches a cumulative aggregate over rows {@code [rl, ru)} of a sparse
     * input to the matching {@code s_ucum*} kernel. {@code agg} is forwarded to
     * the kernel as its initial aggregate and may be {@code null}.
     *
     * @throws DMLRuntimeException for an aggregation type without a case here
     */
    private static void cumaggregateUnaryMatrixSparse(MatrixBlock in, MatrixBlock out, AggType optype, ValueFunction vFn, double[] agg, int rl, int ru) {
        final int numRows = in.rlen;
        final int numCols = in.clen;
        final SparseBlock inBlock = in.getSparseBlock();
        final DenseBlock outBlock = out.getDenseBlock();
        final double[] outVals = out.getDenseBlockValues();
        switch (optype) {
            case CUM_KAHAN_SUM: {
                KahanObject kbuff = new KahanObject(0.0, 0.0);
                KahanPlus kplus = KahanPlus.getKahanPlusFnObject();
                LibMatrixAgg.s_ucumkp(inBlock, agg, outBlock, numRows, numCols, kbuff, kplus, rl, ru);
                break;
            }
            case CUM_PROD: {
                LibMatrixAgg.s_ucumm(inBlock, agg, outVals, numCols, rl, ru);
                break;
            }
            case CUM_MIN:
            case CUM_MAX: {
                double init;
                if (optype == AggType.CUM_MAX) {
                    init = Double.NEGATIVE_INFINITY;
                } else {
                    init = Double.POSITIVE_INFINITY;
                }
                LibMatrixAgg.s_ucummxx(inBlock, agg, outVals, numCols, init, (Builtin)vFn, rl, ru);
                break;
            }
            default: {
                throw new DMLRuntimeException("Unsupported cumulative aggregation type: " + optype);
            }
        }
    }

    /**
     * Fills in the aggregate result for an empty input without scanning it.
     *
     * <p>For a full reduction over an input with zero rows or zero columns,
     * cell {@code (0, 0)} of {@code out} is set to 0 for sums, +/-infinity for
     * min/max and NaN otherwise. For sums, min/max, product and the cumulative
     * types, {@code out} is returned untouched. For index-of-max/min with
     * {@code ReduceCol}, column 0 of every output row is set to the column
     * count; for mean and variance, the element count is written to the count
     * cell of the output layout (column/row 1 for mean, 2 for variance).
     *
     * <p>The element count {@code rlen * clen} is computed in double
     * arithmetic; an int product overflows for large empty matrices.
     *
     * @param in     empty input block (only its dimensions are read)
     * @param out    output block, updated in place
     * @param optype aggregation type
     * @param ixFn   index function selecting the reduction direction
     * @return {@code out}
     * @throws DMLRuntimeException if {@code optype} is not handled
     */
    private static MatrixBlock aggregateUnaryMatrixEmpty(MatrixBlock in, MatrixBlock out, AggType optype, IndexFunction ixFn) {
        if (ixFn instanceof ReduceAll && (in.getNumRows() == 0 || in.getNumColumns() == 0)) {
            double val;
            switch (optype) {
                case KAHAN_SUM:
                case KAHAN_SUM_SQ: {
                    val = 0.0;
                    break;
                }
                case MIN: {
                    val = Double.POSITIVE_INFINITY;
                    break;
                }
                case MAX: {
                    val = Double.NEGATIVE_INFINITY;
                    break;
                }
                default: {
                    val = Double.NaN;
                }
            }
            out.quickSetValue(0, 0, val);
            return out;
        }
        // these types need no further output cells for an empty input
        if (optype == AggType.KAHAN_SUM || optype == AggType.KAHAN_SUM_SQ || optype == AggType.MIN || optype == AggType.MAX || optype == AggType.PROD || optype == AggType.CUM_KAHAN_SUM || optype == AggType.CUM_PROD || optype == AggType.CUM_MIN || optype == AggType.CUM_MAX) {
            return out;
        }
        switch (optype) {
            case MAX_INDEX: {
                if (!(ixFn instanceof ReduceCol)) break;
                for (int i = 0; i < out.rlen; ++i) {
                    out.quickSetValue(i, 0, in.clen);
                }
                break;
            }
            case MIN_INDEX: {
                if (!(ixFn instanceof ReduceCol)) break;
                for (int i = 0; i < out.rlen; ++i) {
                    out.quickSetValue(i, 0, in.clen);
                }
                break;
            }
            case MEAN: {
                if (ixFn instanceof ReduceAll) {
                    // widen before multiplying to avoid int overflow
                    out.quickSetValue(0, 1, (double)in.rlen * in.clen);
                    break;
                }
                if (ixFn instanceof ReduceCol) {
                    for (int i = 0; i < in.rlen; ++i) {
                        out.quickSetValue(i, 1, in.clen);
                    }
                } else {
                    if (!(ixFn instanceof ReduceRow)) break;
                    for (int j = 0; j < in.clen; ++j) {
                        out.quickSetValue(1, j, in.rlen);
                    }
                }
                break;
            }
            case VAR: {
                if (ixFn instanceof ReduceAll) {
                    // widen before multiplying to avoid int overflow
                    out.quickSetValue(0, 2, (double)in.rlen * in.clen);
                    break;
                }
                if (ixFn instanceof ReduceCol) {
                    for (int i = 0; i < in.rlen; ++i) {
                        out.quickSetValue(i, 2, in.clen);
                    }
                } else {
                    if (!(ixFn instanceof ReduceRow)) break;
                    for (int j = 0; j < in.clen; ++j) {
                        out.quickSetValue(2, j, in.rlen);
                    }
                }
                break;
            }
            default: {
                throw new DMLRuntimeException("Unsupported aggregation type: " + (Object)((Object)optype));
            }
        }
        return out;
    }

    /**
     * Full Kahan sum over rows {@code [rl, ru)} of a dense block: each
     * underlying value array touched by the row range is passed to
     * {@code LibMatrixAgg.sum} with the shared buffer, which is then stored
     * into {@code c}.
     */
    private static void d_uakp(DenseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlus kplus, int rl, int ru) {
        final int firstBlock = a.index(rl);
        final int lastBlock = a.index(ru - 1);
        int blk = firstBlock;
        while (blk <= lastBlock) {
            // the first block starts at row rl, later blocks at offset 0
            int start = 0;
            if (blk == firstBlock) {
                start = a.pos(rl);
            }
            // the last block ends with row ru-1, earlier blocks are taken whole
            int count;
            if (blk == lastBlock) {
                count = a.pos(ru - 1) - start + n;
            } else {
                count = a.blockSize(blk) * n;
            }
            LibMatrixAgg.sum(a.valuesAt(blk), start, count, kbuff, kplus);
            ++blk;
        }
        c.set(kbuff);
    }

    /**
     * Per-row Kahan sum over rows {@code [rl, ru)}: the buffer is reset for
     * each row, filled by {@code LibMatrixAgg.sum} over the row's {@code n}
     * values and stored into row {@code i} of {@code c}.
     */
    private static void d_uarkp(DenseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlus kplus, int rl, int ru) {
        int row = rl;
        while (row < ru) {
            kbuff.set(0.0, 0.0);
            double[] rowVals = a.values(row);
            int rowPos = a.pos(row);
            LibMatrixAgg.sum(rowVals, rowPos, n, kbuff, kplus);
            c.set(row, kbuff);
            ++row;
        }
    }

    /**
     * Per-column Kahan sum over rows {@code [rl, ru)}: each row is folded into
     * {@code c} through {@code LibMatrixAgg.sumAgg}.
     */
    private static void d_uackp(DenseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlus kplus, int rl, int ru) {
        int row = rl;
        while (row < ru) {
            double[] rowVals = a.values(row);
            int rowPos = a.pos(row);
            LibMatrixAgg.sumAgg(rowVals, c, rowPos, n, kbuff, kplus);
            ++row;
        }
    }

    /**
     * Full Kahan sum of squares over rows {@code [rl, ru)} of a dense block:
     * each underlying value array touched by the row range is passed to
     * {@code LibMatrixAgg.sum} with the squared-plus function, and the shared
     * buffer is then stored into {@code c}.
     */
    private static void d_uasqkp(DenseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlusSq kplusSq, int rl, int ru) {
        final int firstBlock = a.index(rl);
        final int lastBlock = a.index(ru - 1);
        int blk = firstBlock;
        while (blk <= lastBlock) {
            int start = 0;
            if (blk == firstBlock) {
                start = a.pos(rl);
            }
            int count;
            if (blk == lastBlock) {
                count = a.pos(ru - 1) - start + n;
            } else {
                count = a.blockSize(blk) * n;
            }
            LibMatrixAgg.sum(a.valuesAt(blk), start, count, kbuff, kplusSq);
            ++blk;
        }
        c.set(kbuff);
    }

    /**
     * Per-row Kahan sum of squares over rows {@code [rl, ru)}: the buffer is
     * reset for each row, filled by {@code LibMatrixAgg.sum} with the
     * squared-plus function and stored into row {@code i} of {@code c}.
     */
    private static void d_uarsqkp(DenseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlusSq kplusSq, int rl, int ru) {
        int row = rl;
        while (row < ru) {
            kbuff.set(0.0, 0.0);
            double[] rowVals = a.values(row);
            int rowPos = a.pos(row);
            LibMatrixAgg.sum(rowVals, rowPos, n, kbuff, kplusSq);
            c.set(row, kbuff);
            ++row;
        }
    }

    /**
     * Per-column Kahan sum of squares over rows {@code [rl, ru)}: each row is
     * folded into {@code c} through {@code LibMatrixAgg.sumAgg} with the
     * squared-plus function.
     */
    private static void d_uacsqkp(DenseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlusSq kplusSq, int rl, int ru) {
        int row = rl;
        while (row < ru) {
            double[] rowVals = a.values(row);
            int rowPos = a.pos(row);
            LibMatrixAgg.sumAgg(rowVals, c, rowPos, n, kbuff, kplusSq);
            ++row;
        }
    }

    /**
     * Cumulative Kahan sum down the rows {@code [rl, ru)}: a 2-by-n scratch
     * block holds the running aggregate (its row 0 is seeded from {@code agg}
     * when given), each input row is folded in via {@code LibMatrixAgg.sumAgg},
     * and row 0 of the scratch block is copied to output row {@code i}.
     */
    private static void d_ucumkp(DenseBlock a, double[] agg, DenseBlock c, int n, KahanObject kbuff, KahanPlus kplus, int rl, int ru) {
        DenseBlock running = DenseBlockFactory.createDenseBlock(2, n);
        if (agg != null) {
            running.set(0, agg);
        }
        int row = rl;
        while (row < ru) {
            double[] rowVals = a.values(row);
            int rowPos = a.pos(row);
            LibMatrixAgg.sumAgg(rowVals, running, rowPos, n, kbuff, kplus);
            c.set(row, running.values(0));
            ++row;
        }
    }

    /**
     * Cumulative sum-product over rows {@code [rl, ru)} of a two-column input:
     * for each row, {@code out[i] = in[i,0] + in[i,1] * previous}, where
     * {@code previous} starts at {@code agg[0]} (or 0 when {@code agg} is
     * {@code null}). Only the first value array of each block is accessed.
     */
    private static void d_ucumkpp(DenseBlock a, double[] agg, DenseBlock c, int rl, int ru) {
        double running = 0.0;
        if (agg != null) {
            running = agg[0];
        }
        double[] inVals = a.valuesAt(0);
        double[] outVals = c.valuesAt(0);
        // pos is the offset of row `row` in the two-column input
        for (int row = rl, pos = rl * 2; row < ru; ++row, pos += 2) {
            double next = inVals[pos] + inVals[pos + 1] * running;
            outVals[row] = next;
            running = next;
        }
    }

    /**
     * Cumulative product down the rows {@code [rl, ru)} of a row-major dense
     * array with {@code n} columns. The running products live in {@code agg}
     * when it is provided (and are updated in place), otherwise in a fresh
     * array initialized to 1. After each row the running products are copied
     * into the same row of {@code c}.
     */
    private static void d_ucumm(double[] a, double[] agg, double[] c, int n, int rl, int ru) {
        double[] running;
        if (agg != null) {
            running = agg;
        } else {
            running = new double[n];
            Arrays.fill(running, 1.0);
        }
        for (int row = rl, rowOff = rl * n; row < ru; ++row, rowOff += n) {
            LibMatrixAgg.productAgg(a, running, rowOff, 0, n);
            System.arraycopy(running, 0, c, rowOff, n);
        }
    }

    /**
     * Cumulative min/max down the rows {@code [rl, ru)} of a row-major dense
     * array with {@code n} columns. The running values live in {@code agg}
     * when it is provided (and are updated in place), otherwise in a fresh
     * array initialized to {@code init}. After each row the running values are
     * copied into the same row of {@code c}.
     */
    private static void d_ucummxx(double[] a, double[] agg, double[] c, int n, double init, Builtin builtin, int rl, int ru) {
        double[] running;
        if (agg != null) {
            running = agg;
        } else {
            running = new double[n];
            Arrays.fill(running, init);
        }
        for (int row = rl, rowOff = rl * n; row < ru; ++row, rowOff += n) {
            LibMatrixAgg.builtinAgg(a, running, rowOff, n, builtin);
            System.arraycopy(running, 0, c, rowOff, n);
        }
    }

    /**
     * Kahan sum of the diagonal cells {@code (i, i)} for {@code i} in
     * {@code [rl, ru)}; the buffer is stored into {@code c} afterwards.
     */
    private static void d_uakptrace(DenseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlus kplus, int rl, int ru) {
        int diag = rl;
        while (diag < ru) {
            double cell = a.get(diag, diag);
            kplus.execute2(kbuff, cell);
            ++diag;
        }
        c.set(kbuff);
    }

    /**
     * Full min/max over rows {@code [rl, ru)} of a dense block: the running
     * value starts at {@code init}, is threaded through
     * {@code LibMatrixAgg.builtin} for every underlying value array touched by
     * the row range, and is written to cell {@code (0, 0)} of {@code c}.
     */
    private static void d_uamxx(DenseBlock a, DenseBlock c, int n, double init, Builtin builtin, int rl, int ru) {
        final int firstBlock = a.index(rl);
        final int lastBlock = a.index(ru - 1);
        double running = init;
        int blk = firstBlock;
        while (blk <= lastBlock) {
            int start = 0;
            if (blk == firstBlock) {
                start = a.pos(rl);
            }
            int count;
            if (blk == lastBlock) {
                count = a.pos(ru - 1) - start + n;
            } else {
                count = a.blockSize(blk) * n;
            }
            running = LibMatrixAgg.builtin(a.valuesAt(blk), start, running, count, builtin);
            ++blk;
        }
        c.set(0, 0, running);
    }

    /**
     * Per-row min/max over rows {@code [rl, ru)}: each row is reduced with
     * {@code LibMatrixAgg.builtin} starting from {@code init}, and the result
     * is written to column 0 of the same row of {@code c}.
     */
    private static void d_uarmxx(DenseBlock a, DenseBlock c, int n, double init, Builtin builtin, int rl, int ru) {
        int row = rl;
        while (row < ru) {
            double rowResult = LibMatrixAgg.builtin(a.values(row), a.pos(row), init, n, builtin);
            c.set(row, 0, rowResult);
            ++row;
        }
    }

    /**
     * Dense column-wise aggregate with {@code builtin}: {@code c} is filled with {@code init}
     * and every row in {@code [rl, ru)} is folded into the first value array of {@code c}.
     *
     * @param a       dense input block
     * @param c       output block (reset to {@code init} first)
     * @param n       number of columns
     * @param init    initial aggregate value per column
     * @param builtin aggregation function applied pairwise
     * @param rl      first row (inclusive)
     * @param ru      last row (exclusive)
     */
    private static void d_uacmxx(DenseBlock a, DenseBlock c, int n, double init, Builtin builtin, int rl, int ru) {
        c.set(init);
        final double[] colAgg = c.valuesAt(0);
        int row = rl;
        while (row < ru) {
            LibMatrixAgg.builtinAgg(a.values(row), colAgg, a.pos(row), n, builtin);
            row++;
        }
    }

    /**
     * Dense row index-of-maximum: for every row in {@code [rl, ru)} writes the 1-based column
     * position returned by {@code indexmax} to {@code c(row, 0)} and the cell value at that
     * position to {@code c(row, 1)}.
     *
     * @param a       dense input block
     * @param c       output block with at least two columns
     * @param n       number of columns, must be positive
     * @param init    initial comparison value
     * @param builtin aggregation function forwarded to {@code indexmax}
     * @param rl      first row (inclusive)
     * @param ru      last row (exclusive)
     * @throws DMLRuntimeException if {@code n <= 0}
     */
    private static void d_uarimax(DenseBlock a, DenseBlock c, int n, double init, Builtin builtin, int rl, int ru) {
        if (n <= 0) {
            throw new DMLRuntimeException("rowIndexMax undefined for ncol=" + n);
        }
        int row = rl;
        while (row < ru) {
            final int best = LibMatrixAgg.indexmax(a.values(row), a.pos(row), init, n, builtin);
            c.set(row, 0, best + 1.0);
            c.set(row, 1, a.get(row, best));
            row++;
        }
    }

    /**
     * Dense row index-of-minimum: for every row in {@code [rl, ru)} writes the 1-based column
     * position returned by {@code indexmin} to {@code c(row, 0)} and the cell value at that
     * position to {@code c(row, 1)}.
     *
     * @param a       dense input block
     * @param c       output block with at least two columns
     * @param n       number of columns, must be positive
     * @param init    initial comparison value
     * @param builtin aggregation function forwarded to {@code indexmin}
     * @param rl      first row (inclusive)
     * @param ru      last row (exclusive)
     * @throws DMLRuntimeException if {@code n <= 0}
     */
    private static void d_uarimin(DenseBlock a, DenseBlock c, int n, double init, Builtin builtin, int rl, int ru) {
        if (n <= 0) {
            throw new DMLRuntimeException("rowIndexMin undefined for ncol=" + n);
        }
        int row = rl;
        while (row < ru) {
            final int best = LibMatrixAgg.indexmin(a.values(row), a.pos(row), init, n, builtin);
            c.set(row, 0, best + 1.0);
            c.set(row, 1, a.get(row, best));
            row++;
        }
    }

    /**
     * Dense full mean over all cells of rows {@code [rl, ru)}. The range is processed block by
     * block; the output row 0 receives {@code kbuff._sum}, the number of processed cells and
     * {@code kbuff._correction} in columns 0, 1 and 2.
     *
     * @param a     dense input block
     * @param c     output block with at least three columns
     * @param n     number of columns
     * @param kbuff accumulation buffer
     * @param kmean incremental mean function
     * @param rl    first row (inclusive)
     * @param ru    last row (exclusive)
     */
    private static void d_uamean(DenseBlock a, DenseBlock c, int n, KahanObject kbuff, Mean kmean, int rl, int ru) {
        final int firstBlock = a.index(rl);
        final int lastBlock = a.index(ru - 1);
        int cells = 0;
        for (int blk = firstBlock; blk <= lastBlock; blk++) {
            int start = 0;
            if (blk == firstBlock) {
                start = a.pos(rl);
            }
            int count;
            if (blk == lastBlock) {
                count = a.pos(ru - 1) - start + n;
            } else {
                count = a.blockSize(blk) * n;
            }
            LibMatrixAgg.mean(a.valuesAt(blk), start, count, 0, kbuff, kmean);
            cells += count;
        }
        c.set(0, 0, kbuff._sum);
        c.set(0, 1, cells);
        c.set(0, 2, kbuff._correction);
    }

    /**
     * Dense row means: for every row in {@code [rl, ru)} the buffer is reset, the row is
     * aggregated, and {@code kbuff._sum}, {@code n} and {@code kbuff._correction} are written
     * to columns 0, 1 and 2 of the same output row.
     *
     * @param a     dense input block
     * @param c     output block with at least three columns
     * @param n     number of columns
     * @param kbuff accumulation buffer (reset per row)
     * @param kmean incremental mean function
     * @param rl    first row (inclusive)
     * @param ru    last row (exclusive)
     */
    private static void d_uarmean(DenseBlock a, DenseBlock c, int n, KahanObject kbuff, Mean kmean, int rl, int ru) {
        int row = rl;
        while (row < ru) {
            kbuff.set(0.0, 0.0);
            LibMatrixAgg.mean(a.values(row), a.pos(row), n, 0, kbuff, kmean);
            c.set(row, 0, kbuff._sum);
            c.set(row, 1, n);
            c.set(row, 2, kbuff._correction);
            row++;
        }
    }

    /**
     * Dense column means: folds every row in {@code [rl, ru)} into the per-column state held
     * in {@code c} by delegating to the dense {@code meanAgg}.
     *
     * @param a     dense input block
     * @param c     output block holding the per-column state
     * @param n     number of columns
     * @param kbuff scratch buffer
     * @param kmean incremental mean function
     * @param rl    first row (inclusive)
     * @param ru    last row (exclusive)
     */
    private static void d_uacmean(DenseBlock a, DenseBlock c, int n, KahanObject kbuff, Mean kmean, int rl, int ru) {
        int row = rl;
        while (row < ru) {
            LibMatrixAgg.meanAgg(a.values(row), c, a.pos(row), n, kbuff, kmean);
            row++;
        }
    }

    /**
     * Dense full variance over all cells of rows {@code [rl, ru)}. The range is processed
     * block by block; output row 0 then receives, in columns 0..4: the variance result,
     * {@code mean._sum}, {@code w}, {@code m2._correction} and {@code mean._correction}
     * of {@code cbuff}.
     *
     * @param a     dense input block
     * @param c     output block with at least five columns
     * @param n     number of columns
     * @param cbuff central-moment buffer
     * @param cm    central-moment function
     * @param rl    first row (inclusive)
     * @param ru    last row (exclusive)
     */
    private static void d_uavar(DenseBlock a, DenseBlock c, int n, CM_COV_Object cbuff, CM cm, int rl, int ru) {
        final int firstBlock = a.index(rl);
        final int lastBlock = a.index(ru - 1);
        for (int blk = firstBlock; blk <= lastBlock; blk++) {
            int start = 0;
            if (blk == firstBlock) {
                start = a.pos(rl);
            }
            int count;
            if (blk == lastBlock) {
                count = a.pos(ru - 1) - start + n;
            } else {
                count = a.blockSize(blk) * n;
            }
            LibMatrixAgg.var(a.valuesAt(blk), start, count, cbuff, cm);
        }
        c.set(0, 0, cbuff.getRequiredResult(CMOperator.AggregateOperationTypes.VARIANCE));
        c.set(0, 1, cbuff.mean._sum);
        c.set(0, 2, cbuff.w);
        c.set(0, 3, cbuff.m2._correction);
        c.set(0, 4, cbuff.mean._correction);
    }

    /**
     * Dense row variances: for every row in {@code [rl, ru)} the buffer is reset, the row is
     * aggregated, and the variance result, {@code mean._sum}, {@code w},
     * {@code m2._correction} and {@code mean._correction} are written to columns 0..4 of the
     * same output row.
     *
     * @param a     dense input block
     * @param c     output block with at least five columns
     * @param n     number of columns
     * @param cbuff central-moment buffer (reset per row)
     * @param cm    central-moment function
     * @param rl    first row (inclusive)
     * @param ru    last row (exclusive)
     */
    private static void d_uarvar(DenseBlock a, DenseBlock c, int n, CM_COV_Object cbuff, CM cm, int rl, int ru) {
        int row = rl;
        while (row < ru) {
            cbuff.reset();
            LibMatrixAgg.var(a.values(row), a.pos(row), n, cbuff, cm);
            c.set(row, 0, cbuff.getRequiredResult(CMOperator.AggregateOperationTypes.VARIANCE));
            c.set(row, 1, cbuff.mean._sum);
            c.set(row, 2, cbuff.w);
            c.set(row, 3, cbuff.m2._correction);
            c.set(row, 4, cbuff.mean._correction);
            row++;
        }
    }

    /**
     * Dense column variances: folds every row in {@code [rl, ru)} into the per-column state
     * held in {@code c} by delegating to the dense {@code varAgg}.
     *
     * @param a     dense input block
     * @param c     output block holding the per-column state
     * @param n     number of columns
     * @param cbuff scratch central-moment buffer
     * @param cm    central-moment function
     * @param rl    first row (inclusive)
     * @param ru    last row (exclusive)
     */
    private static void d_uacvar(DenseBlock a, DenseBlock c, int n, CM_COV_Object cbuff, CM cm, int rl, int ru) {
        int row = rl;
        while (row < ru) {
            LibMatrixAgg.varAgg(a.values(row), c, a.pos(row), n, cbuff, cm);
            row++;
        }
    }

    /**
     * Dense full product over all cells of rows {@code [rl, ru)}, processed block by block and
     * written to {@code c(0, 0)}.
     *
     * @param a  dense input block
     * @param c  output block
     * @param n  number of columns
     * @param rl first row (inclusive)
     * @param ru last row (exclusive)
     */
    private static void d_uam(DenseBlock a, DenseBlock c, int n, int rl, int ru) {
        final int firstBlock = a.index(rl);
        final int lastBlock = a.index(ru - 1);
        double acc = 1.0;
        for (int blk = firstBlock; blk <= lastBlock; blk++) {
            int start = 0;
            if (blk == firstBlock) {
                start = a.pos(rl);
            }
            int count;
            if (blk == lastBlock) {
                count = a.pos(ru - 1) - start + n;
            } else {
                count = a.blockSize(blk) * n;
            }
            acc *= LibMatrixAgg.product(a.valuesAt(blk), start, count);
        }
        c.set(0, 0, acc);
    }

    /**
     * Dense row products: for every row in {@code [rl, ru)} the product of its {@code n}
     * cells is stored at index {@code row} of the first value array of {@code c}.
     *
     * @param a  dense input block
     * @param c  output block; only {@code c.valuesAt(0)} is written
     * @param n  number of columns
     * @param rl first row (inclusive)
     * @param ru last row (exclusive)
     */
    private static void d_uarm(DenseBlock a, DenseBlock c, int n, int rl, int ru) {
        final double[] out = c.valuesAt(0);
        int row = rl;
        while (row < ru) {
            out[row] = LibMatrixAgg.product(a.values(row), a.pos(row), n);
            row++;
        }
    }

    /**
     * Dense column products: {@code c} is filled with 1.0 and every row in {@code [rl, ru)}
     * is multiplied element-wise into the first value array of {@code c}.
     *
     * @param a  dense input block
     * @param c  output block (reset to 1.0 first)
     * @param n  number of columns
     * @param rl first row (inclusive)
     * @param ru last row (exclusive)
     */
    private static void d_uacm(DenseBlock a, DenseBlock c, int n, int rl, int ru) {
        final double[] colProd = c.set(1.0).valuesAt(0);
        int row = rl;
        while (row < ru) {
            // colProd[j] = a(row, j) * colProd[j] for all n columns
            LibMatrixMult.vectMultiplyWrite(a.values(row), colProd, colProd, a.pos(row), 0, 0, n);
            row++;
        }
    }

    /**
     * Sparse full sum over the stored values of rows {@code [rl, ru)}. A contiguous block is
     * summed in one pass over {@code a.size(rl, ru)} values; otherwise each non-empty row is
     * summed separately. The buffer is finally stored via {@code c.set(kbuff)}.
     *
     * @param a     sparse input block
     * @param c     output block
     * @param n     number of columns (not used here)
     * @param kbuff accumulation buffer
     * @param kplus summation function
     * @param rl    first row (inclusive)
     * @param ru    last row (exclusive)
     */
    private static void s_uakp(SparseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlus kplus, int rl, int ru) {
        if (!a.isContiguous()) {
            for (int row = rl; row < ru; row++) {
                if (!a.isEmpty(row)) {
                    LibMatrixAgg.sum(a.values(row), a.pos(row), a.size(row), kbuff, kplus);
                }
            }
        } else {
            LibMatrixAgg.sum(a.values(rl), a.pos(rl), (int) a.size(rl, ru), kbuff, kplus);
        }
        c.set(kbuff);
    }

    /**
     * Sparse row sums: every non-empty row in {@code [rl, ru)} is summed into a freshly reset
     * buffer, which is then stored via {@code c.set(row, kbuff)}. Empty rows are not written.
     *
     * @param a     sparse input block
     * @param c     output block
     * @param n     number of columns (not used here)
     * @param kbuff accumulation buffer (reset per non-empty row)
     * @param kplus summation function
     * @param rl    first row (inclusive)
     * @param ru    last row (exclusive)
     */
    private static void s_uarkp(SparseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlus kplus, int rl, int ru) {
        for (int row = rl; row < ru; row++) {
            if (!a.isEmpty(row)) {
                kbuff.set(0.0, 0.0);
                LibMatrixAgg.sum(a.values(row), a.pos(row), a.size(row), kbuff, kplus);
                c.set(row, kbuff);
            }
        }
    }

    /**
     * Sparse column sums: scatters the stored values of rows {@code [rl, ru)} into the
     * per-column state of {@code c} using the sparse {@code sumAgg}. A contiguous block is
     * handled in a single call, otherwise row by row (skipping empty rows).
     *
     * @param a     sparse input block
     * @param c     output block holding the per-column state
     * @param n     number of columns
     * @param kbuff scratch buffer
     * @param kplus summation function
     * @param rl    first row (inclusive)
     * @param ru    last row (exclusive)
     */
    private static void s_uackp(SparseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlus kplus, int rl, int ru) {
        if (!a.isContiguous()) {
            for (int row = rl; row < ru; row++) {
                if (!a.isEmpty(row)) {
                    LibMatrixAgg.sumAgg(a.values(row), c, a.indexes(row), a.pos(row), a.size(row), n, kbuff, kplus);
                }
            }
            return;
        }
        LibMatrixAgg.sumAgg(a.values(rl), c, a.indexes(rl), a.pos(rl), (int) a.size(rl, ru), n, kbuff, kplus);
    }

    /**
     * Sparse full aggregate with {@code kplusSq} over the stored values of rows
     * {@code [rl, ru)}. A contiguous block is processed in one pass; otherwise each non-empty
     * row is processed separately. The buffer is finally stored via {@code c.set(kbuff)}.
     *
     * @param a       sparse input block
     * @param c       output block
     * @param n       number of columns (not used here)
     * @param kbuff   accumulation buffer
     * @param kplusSq summation function
     * @param rl      first row (inclusive)
     * @param ru      last row (exclusive)
     */
    private static void s_uasqkp(SparseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlusSq kplusSq, int rl, int ru) {
        if (!a.isContiguous()) {
            for (int row = rl; row < ru; row++) {
                if (!a.isEmpty(row)) {
                    LibMatrixAgg.sum(a.values(row), a.pos(row), a.size(row), kbuff, kplusSq);
                }
            }
        } else {
            LibMatrixAgg.sum(a.values(rl), a.pos(rl), (int) a.size(rl, ru), kbuff, kplusSq);
        }
        c.set(kbuff);
    }

    /**
     * Sparse row aggregate with {@code kplusSq}: every non-empty row in {@code [rl, ru)} is
     * aggregated into a freshly reset buffer, which is then stored via
     * {@code c.set(row, kbuff)}. Empty rows are not written.
     *
     * @param a       sparse input block
     * @param c       output block
     * @param n       number of columns (not used here)
     * @param kbuff   accumulation buffer (reset per non-empty row)
     * @param kplusSq summation function
     * @param rl      first row (inclusive)
     * @param ru      last row (exclusive)
     */
    private static void s_uarsqkp(SparseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlusSq kplusSq, int rl, int ru) {
        for (int row = rl; row < ru; row++) {
            if (!a.isEmpty(row)) {
                kbuff.set(0.0, 0.0);
                LibMatrixAgg.sum(a.values(row), a.pos(row), a.size(row), kbuff, kplusSq);
                c.set(row, kbuff);
            }
        }
    }

    /**
     * Sparse column aggregate with {@code kplusSq}: scatters the stored values of rows
     * {@code [rl, ru)} into the per-column state of {@code c} using the sparse
     * {@code sumAgg}. A contiguous block is handled in a single call, otherwise row by row
     * (skipping empty rows).
     *
     * @param a       sparse input block
     * @param c       output block holding the per-column state
     * @param n       number of columns
     * @param kbuff   scratch buffer
     * @param kplusSq summation function
     * @param rl      first row (inclusive)
     * @param ru      last row (exclusive)
     */
    private static void s_uacsqkp(SparseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlusSq kplusSq, int rl, int ru) {
        if (!a.isContiguous()) {
            for (int row = rl; row < ru; row++) {
                if (!a.isEmpty(row)) {
                    LibMatrixAgg.sumAgg(a.values(row), c, a.indexes(row), a.pos(row), a.size(row), n, kbuff, kplusSq);
                }
            }
            return;
        }
        LibMatrixAgg.sumAgg(a.values(rl), c, a.indexes(rl), a.pos(rl), (int) a.size(rl, ru), n, kbuff, kplusSq);
    }

    /**
     * Sparse cumulative column sums over rows {@code [rl, ru)}. A temporary two-row dense
     * block holds the running state (row 0 is seeded from {@code agg} when given); after each
     * input row the current state is copied into the matching output row.
     *
     * @param a     sparse input block
     * @param agg   initial running sums, or {@code null}
     * @param c     output block
     * @param m     number of rows (not used here)
     * @param n     number of columns
     * @param kbuff scratch buffer
     * @param kplus summation function
     * @param rl    first row (inclusive)
     * @param ru    last row (exclusive)
     */
    private static void s_ucumkp(SparseBlock a, double[] agg, DenseBlock c, int m, int n, KahanObject kbuff, KahanPlus kplus, int rl, int ru) {
        final DenseBlock running = DenseBlockFactory.createDenseBlock(2, n);
        if (agg != null) {
            running.set(0, agg);
        }
        int row = rl;
        while (row < ru) {
            if (!a.isEmpty(row)) {
                LibMatrixAgg.sumAgg(a.values(row), running, a.indexes(row), a.pos(row), a.size(row), n, kbuff, kplus);
            }
            // empty rows leave the running state untouched but still emit it
            c.set(row, running.values(0));
            row++;
        }
    }

    /**
     * Sparse cumulative column product over rows {@code [rl, ru)}; output is written
     * row-major into {@code c} at offset {@code row * n}.
     *
     * <p>Because only stored (non-zero) values are multiplied, a per-column counter tracks
     * how many stored values were seen so far. A column whose counter is below the number of
     * rows processed in this call contains at least one implicit zero, so its running
     * product is multiplied by 0.
     *
     * <p>Fix: the counters start at row {@code rl}, so they must be compared against
     * {@code i - rl + 1} (rows processed so far) rather than {@code i + 1}; the previous
     * comparison zeroed every column whenever {@code rl > 0}.
     *
     * @param a   sparse input block
     * @param agg running products to continue from (updated in place), or {@code null} to
     *            start from a vector of ones
     * @param c   output array
     * @param n   number of columns
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     */
    private static void s_ucumm(SparseBlock a, double[] agg, double[] c, int n, int rl, int ru) {
        final double[] cprod = agg != null ? agg : new double[n];
        if (agg == null) {
            Arrays.fill(cprod, 1.0);
        }
        final int[] cnt = new int[n];
        int i = rl;
        int ix = rl * n;
        while (i < ru) {
            if (!a.isEmpty(i)) {
                int apos = a.pos(i);
                int alen = a.size(i);
                int[] aix = a.indexes(i);
                double[] avals = a.values(i);
                LibMatrixAgg.productAgg(avals, cprod, aix, apos, 0, alen);
                LibMatrixAgg.countAgg(avals, cnt, aix, apos, alen);
            }
            // correction for implicit zeros, relative to the rows seen in this range
            final int rowsSeen = i - rl + 1;
            for (int j = 0; j < n; ++j) {
                if (cnt[j] < rowsSeen) {
                    // multiply (rather than assign) so NaN/Inf propagate as before
                    cprod[j] *= 0.0;
                }
            }
            System.arraycopy(cprod, 0, c, ix, n);
            ++i;
            ix += n;
        }
    }

    /**
     * Sparse cumulative column-wise aggregate with {@code builtin} over rows
     * {@code [rl, ru)}; output is written row-major into {@code c} at offset {@code row * n}.
     *
     * <p>Because only stored (non-zero) values are aggregated, a per-column counter tracks
     * how many stored values were seen so far. A column whose counter is below the number of
     * rows processed in this call contains at least one implicit zero, so 0.0 is folded into
     * its running aggregate.
     *
     * <p>Fix: the counters start at row {@code rl}, so they must be compared against
     * {@code i - rl + 1} (rows processed so far) rather than {@code i + 1}; the previous
     * comparison folded 0.0 into every column whenever {@code rl > 0}.
     *
     * @param a       sparse input block
     * @param agg     running aggregates to continue from (updated in place), or {@code null}
     *                to start from a vector filled with {@code init}
     * @param c       output array
     * @param n       number of columns
     * @param init    initial aggregate value used when {@code agg} is {@code null}
     * @param builtin aggregation function applied pairwise
     * @param rl      first row (inclusive)
     * @param ru      last row (exclusive)
     */
    private static void s_ucummxx(SparseBlock a, double[] agg, double[] c, int n, double init, Builtin builtin, int rl, int ru) {
        final double[] cmxx = agg != null ? agg : new double[n];
        if (agg == null) {
            Arrays.fill(cmxx, init);
        }
        final int[] cnt = new int[n];
        int i = rl;
        int ix = rl * n;
        while (i < ru) {
            if (!a.isEmpty(i)) {
                int apos = a.pos(i);
                int alen = a.size(i);
                int[] aix = a.indexes(i);
                double[] avals = a.values(i);
                LibMatrixAgg.builtinAgg(avals, cmxx, aix, apos, alen, builtin);
                LibMatrixAgg.countAgg(avals, cnt, aix, apos, alen);
            }
            // correction for implicit zeros, relative to the rows seen in this range
            final int rowsSeen = i - rl + 1;
            for (int j = 0; j < n; ++j) {
                if (cnt[j] < rowsSeen) {
                    cmxx[j] = builtin.execute(cmxx[j], 0.0);
                }
            }
            System.arraycopy(cmxx, 0, c, ix, n);
            ++i;
            ix += n;
        }
    }

    /**
     * Sparse trace: for every non-empty row {@code i} in {@code [rl, ru)} feeds
     * {@code a.get(i, i)} into {@code kbuff} through {@code kplus.execute2}, then stores the
     * buffer via {@code c.set(kbuff)}.
     *
     * @param a     sparse input block
     * @param c     output block
     * @param n     number of columns (not used here)
     * @param kbuff accumulation buffer
     * @param kplus summation function
     * @param rl    first row (inclusive)
     * @param ru    last row (exclusive)
     */
    private static void s_uakptrace(SparseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlus kplus, int rl, int ru) {
        int diag = rl;
        while (diag < ru) {
            if (!a.isEmpty(diag)) {
                kplus.execute2(kbuff, a.get(diag, diag));
            }
            diag++;
        }
        c.set(kbuff);
    }

    /**
     * Sparse full aggregate with {@code builtin} over rows {@code [rl, ru)}, written to
     * {@code c(0, 0)}. Stored values are aggregated first; whenever fewer values are stored
     * than there are cells (whole range for contiguous blocks, per row otherwise), 0.0 is
     * folded in as well to account for the implicit zeros.
     *
     * @param a       sparse input block
     * @param c       output block
     * @param n       number of columns
     * @param init    initial aggregate value
     * @param builtin aggregation function applied pairwise
     * @param rl      first row (inclusive)
     * @param ru      last row (exclusive)
     */
    private static void s_uamxx(SparseBlock a, DenseBlock c, int n, double init, Builtin builtin, int rl, int ru) {
        double result = init;
        if (!a.isContiguous()) {
            for (int row = rl; row < ru; row++) {
                if (!a.isEmpty(row)) {
                    final double rowAgg = LibMatrixAgg.builtin(a.values(row), a.pos(row), init, a.size(row), builtin);
                    result = builtin.execute(result, rowAgg);
                }
                if (a.size(row) < n) {
                    result = builtin.execute(result, 0.0);
                }
            }
        } else {
            final int stored = (int) a.size(rl, ru);
            final double rangeAgg = LibMatrixAgg.builtin(a.values(rl), a.pos(rl), init, stored, builtin);
            result = builtin.execute(result, rangeAgg);
            if (stored < (ru - rl) * n) {
                result = builtin.execute(result, 0.0);
            }
        }
        c.set(0, 0, result);
    }

    /**
     * Sparse row-wise aggregate with {@code builtin}. Column 0 of output rows
     * {@code [rl, ru)} is first set to {@code init}; each non-empty row then stores the
     * aggregate of its stored values, and rows with fewer than {@code n} stored values
     * additionally fold in 0.0 for the implicit zeros.
     *
     * @param a       sparse input block
     * @param c       output block
     * @param n       number of columns
     * @param init    initial aggregate value per row
     * @param builtin aggregation function applied pairwise
     * @param rl      first row (inclusive)
     * @param ru      last row (exclusive)
     */
    private static void s_uarmxx(SparseBlock a, DenseBlock c, int n, double init, Builtin builtin, int rl, int ru) {
        c.set(rl, ru, 0, 1, init);
        int row = rl;
        while (row < ru) {
            if (!a.isEmpty(row)) {
                c.set(row, 0, LibMatrixAgg.builtin(a.values(row), a.pos(row), init, a.size(row), builtin));
            }
            if (a.size(row) < n) {
                c.set(row, 0, builtin.execute(c.get(row, 0), 0.0));
            }
            row++;
        }
    }

    /**
     * Sparse column-wise aggregate with {@code builtin} over rows {@code [rl, ru)}.
     * {@code dc} is filled with {@code init}, stored values are scattered into the per-column
     * aggregates while counting stored values per column, and finally every column with fewer
     * stored values than rows folds in 0.0 for its implicit zeros.
     *
     * @param a       sparse input block
     * @param dc      output block (reset to {@code init} first; must not be {@code null})
     * @param m       number of rows (not used here)
     * @param n       number of columns
     * @param init    initial aggregate value per column
     * @param builtin aggregation function applied pairwise
     * @param rl      first row (inclusive)
     * @param ru      last row (exclusive)
     */
    private static void s_uacmxx(SparseBlock a, DenseBlock dc, int m, int n, double init, Builtin builtin, int rl, int ru) {
        dc.set(init);
        // dc was just dereferenced, so it cannot be null at this point
        final double[] colAgg = dc.valuesAt(0);
        final int[] stored = new int[n];
        if (!a.isContiguous()) {
            for (int row = rl; row < ru; row++) {
                if (!a.isEmpty(row)) {
                    final int apos = a.pos(row);
                    final int alen = a.size(row);
                    final double[] avals = a.values(row);
                    final int[] aix = a.indexes(row);
                    LibMatrixAgg.builtinAgg(avals, colAgg, aix, apos, alen, builtin);
                    LibMatrixAgg.countAgg(avals, stored, aix, apos, alen);
                }
            }
        } else {
            final int alen = (int) a.size(rl, ru);
            LibMatrixAgg.builtinAgg(a.values(rl), colAgg, a.indexes(rl), a.pos(rl), alen, builtin);
            LibMatrixAgg.countAgg(a.values(rl), stored, a.indexes(rl), a.pos(rl), alen);
        }
        final int rows = ru - rl;
        for (int col = 0; col < n; col++) {
            if (stored[col] < rows) {
                colAgg[col] = builtin.execute(colAgg[col], 0.0);
            }
        }
    }

    /**
     * Sparse row index-of-maximum for rows {@code [rl, ru)}: column 0 of the output row
     * receives a 1-based column index, column 1 the corresponding value.
     *
     * <ul>
     * <li>Empty row: index {@code n}, value 0.0.</li>
     * <li>Otherwise the best stored value (per {@code indexmax}) is reported. If the row has
     *     implicit zeros ({@code alen < n}) and {@code builtin.execute(0.0, maxvalue)} yields
     *     1.0, the result is replaced by value 0.0 at the highest column index that has no
     *     stored entry (found by scanning the stored indexes from the back).</li>
     * </ul>
     *
     * @param a       sparse input block
     * @param c       output block with at least two columns
     * @param n       number of columns, must be positive
     * @param init    initial comparison value
     * @param builtin function forwarded to {@code indexmax} and used for the zero comparison
     * @param rl      first row (inclusive)
     * @param ru      last row (exclusive)
     * @throws DMLRuntimeException if {@code n <= 0}
     */
    private static void s_uarimax(SparseBlock a, DenseBlock c, int n, double init, Builtin builtin, int rl, int ru) {
        if (n <= 0) {
            throw new DMLRuntimeException("rowIndexMax is undefined for ncol=" + n);
        }
        for (int row = rl; row < ru; row++) {
            if (a.isEmpty(row)) {
                c.set(row, 0, n);
                c.set(row, 1, 0.0);
                continue;
            }
            final int apos = a.pos(row);
            final int alen = a.size(row);
            final int[] aix = a.indexes(row);
            final double[] avals = a.values(row);
            final int best = LibMatrixAgg.indexmax(avals, apos, init, alen, builtin);
            final double maxvalue = avals[apos + best];
            c.set(row, 0, aix[apos + best] + 1.0);
            c.set(row, 1, maxvalue);
            if (alen < n && builtin.execute(0.0, maxvalue) == 1.0) {
                // walk back over trailing stored columns to find the last unstored one
                int zeroCol = n - 1;
                int k = apos + alen - 1;
                while (k >= apos && aix[k] == zeroCol) {
                    k--;
                    zeroCol--;
                }
                c.set(row, 0, zeroCol + 1);
                c.set(row, 1, 0.0);
            }
        }
    }

    /**
     * Sparse row index-of-minimum for rows {@code [rl, ru)}: column 0 of the output row
     * receives a 1-based column index, column 1 the corresponding value.
     *
     * <ul>
     * <li>Empty row: index {@code n}, value 0.0.</li>
     * <li>Otherwise the best stored value (per {@code indexmin}) is reported. If the row has
     *     implicit zeros ({@code alen < n}) and {@code builtin.execute(0.0, minvalue)} yields
     *     1.0, the result is replaced by value 0.0 at the highest column index that has no
     *     stored entry (found by scanning the stored indexes from the back).</li>
     * </ul>
     *
     * @param a       sparse input block
     * @param c       output block with at least two columns
     * @param n       number of columns, must be positive
     * @param init    initial comparison value
     * @param builtin function forwarded to {@code indexmin} and used for the zero comparison
     * @param rl      first row (inclusive)
     * @param ru      last row (exclusive)
     * @throws DMLRuntimeException if {@code n <= 0}
     */
    private static void s_uarimin(SparseBlock a, DenseBlock c, int n, double init, Builtin builtin, int rl, int ru) {
        if (n <= 0) {
            throw new DMLRuntimeException("rowIndexMin is undefined for ncol=" + n);
        }
        for (int row = rl; row < ru; row++) {
            if (a.isEmpty(row)) {
                c.set(row, 0, n);
                c.set(row, 1, 0.0);
                continue;
            }
            final int apos = a.pos(row);
            final int alen = a.size(row);
            final int[] aix = a.indexes(row);
            final double[] avals = a.values(row);
            final int best = LibMatrixAgg.indexmin(avals, apos, init, alen, builtin);
            final double minvalue = avals[apos + best];
            c.set(row, 0, aix[apos + best] + 1.0);
            c.set(row, 1, minvalue);
            if (alen < n && builtin.execute(0.0, minvalue) == 1.0) {
                // walk back over trailing stored columns to find the last unstored one
                int zeroCol = n - 1;
                int k = alen - 1;
                while (k >= 0 && aix[apos + k] == zeroCol) {
                    k--;
                    zeroCol--;
                }
                c.set(row, 0, zeroCol + 1);
                c.set(row, 1, 0.0);
            }
        }
    }

    /**
     * Sparse full mean over rows {@code [rl, ru)}. The running count starts at the number of
     * cells without a stored value, so the stored values are averaged in on top of the
     * implicit zeros. Output row 0 receives {@code kbuff._sum}, the total cell count
     * {@code (ru - rl) * n} and {@code kbuff._correction} in columns 0, 1 and 2.
     *
     * @param a     sparse input block
     * @param c     output block with at least three columns
     * @param n     number of columns
     * @param kbuff accumulation buffer
     * @param kmean incremental mean function
     * @param rl    first row (inclusive)
     * @param ru    last row (exclusive)
     */
    private static void s_uamean(SparseBlock a, DenseBlock c, int n, KahanObject kbuff, Mean kmean, int rl, int ru) {
        final int cells = (ru - rl) * n;
        // start counting after the implicit zeros
        int seen = (int) ((long) cells - a.size(rl, ru));
        if (!a.isContiguous()) {
            for (int row = rl; row < ru; row++) {
                if (!a.isEmpty(row)) {
                    final int alen = a.size(row);
                    LibMatrixAgg.mean(a.values(row), a.pos(row), alen, seen, kbuff, kmean);
                    seen += alen;
                }
            }
        } else {
            final int alen = (int) a.size(rl, ru);
            LibMatrixAgg.mean(a.values(rl), a.pos(rl), alen, seen, kbuff, kmean);
            seen += alen;
        }
        c.set(0, 0, kbuff._sum);
        c.set(0, 1, cells);
        c.set(0, 2, kbuff._correction);
    }

    /**
     * Sparse row means for rows {@code [rl, ru)}. Per row the buffer is reset and, for
     * non-empty rows, the stored values are averaged in with the running count starting at
     * the number of implicit zeros. {@code kbuff._sum}, {@code n} and
     * {@code kbuff._correction} go to columns 0, 1 and 2 of the output row.
     *
     * @param a     sparse input block
     * @param c     output block with at least three columns
     * @param n     number of columns
     * @param kbuff accumulation buffer (reset per row)
     * @param kmean incremental mean function
     * @param rl    first row (inclusive)
     * @param ru    last row (exclusive)
     */
    private static void s_uarmean(SparseBlock a, DenseBlock c, int n, KahanObject kbuff, Mean kmean, int rl, int ru) {
        for (int row = rl; row < ru; row++) {
            final boolean empty = a.isEmpty(row);
            int zeros = n;
            if (!empty) {
                zeros = n - a.size(row);
            }
            kbuff.set(0.0, 0.0);
            if (!empty) {
                LibMatrixAgg.mean(a.values(row), a.pos(row), a.size(row), zeros, kbuff, kmean);
            }
            c.set(row, 0, kbuff._sum);
            c.set(row, 1, n);
            c.set(row, 2, kbuff._correction);
        }
    }

    /**
     * Sparse column means over rows {@code [rl, ru)}, done in two passes that must not be
     * interleaved:
     * <ol>
     * <li>Row 1 of {@code c} (the per-column counts) is set to {@code ru - rl} and then
     *     decremented once per stored value, leaving the number of implicit zeros per
     *     column.</li>
     * <li>The stored values are folded into the per-column state with the sparse
     *     {@code meanAgg}, which increments those counts again.</li>
     * </ol>
     *
     * @param a     sparse input block
     * @param c     output block holding the per-column state
     * @param n     number of columns
     * @param kbuff scratch buffer
     * @param kmean incremental mean function
     * @param rl    first row (inclusive)
     * @param ru    last row (exclusive)
     */
    private static void s_uacmean(SparseBlock a, DenseBlock c, int n, KahanObject kbuff, Mean kmean, int rl, int ru) {
        c.set(1, 2, 0, n, ru - rl);
        final double[] counts = c.values(1);
        final int countsOff = c.pos(1);
        if (a.isContiguous()) {
            LibMatrixAgg.countDisAgg(a.values(rl), counts, a.indexes(rl), a.pos(rl), countsOff, (int) a.size(rl, ru));
            LibMatrixAgg.meanAgg(a.values(rl), c, a.indexes(rl), a.pos(rl), (int) a.size(rl, ru), n, kbuff, kmean);
            return;
        }
        // pass 1: subtract stored values from the per-column counts
        for (int row = rl; row < ru; row++) {
            if (!a.isEmpty(row)) {
                LibMatrixAgg.countDisAgg(a.values(row), counts, a.indexes(row), a.pos(row), countsOff, a.size(row));
            }
        }
        // pass 2: aggregate stored values
        for (int row = rl; row < ru; row++) {
            if (!a.isEmpty(row)) {
                LibMatrixAgg.meanAgg(a.values(row), c, a.indexes(row), a.pos(row), a.size(row), n, kbuff, kmean);
            }
        }
    }

    /**
     * Sparse full variance over rows {@code [rl, ru)}. {@code cbuff.w} is initialised with
     * the number of cells without a stored value before the stored values are aggregated.
     * Output row 0 then receives, in columns 0..4: the variance result, {@code mean._sum},
     * {@code w}, {@code m2._correction} and {@code mean._correction} of {@code cbuff}.
     *
     * @param a     sparse input block
     * @param c     output block with at least five columns
     * @param n     number of columns
     * @param cbuff central-moment buffer
     * @param cm    central-moment function
     * @param rl    first row (inclusive)
     * @param ru    last row (exclusive)
     */
    private static void s_uavar(SparseBlock a, DenseBlock c, int n, CM_COV_Object cbuff, CM cm, int rl, int ru) {
        final int zeros = (ru - rl) * n - (int) a.size(rl, ru);
        cbuff.w = zeros;
        if (!a.isContiguous()) {
            for (int row = rl; row < ru; row++) {
                if (!a.isEmpty(row)) {
                    LibMatrixAgg.var(a.values(row), a.pos(row), a.size(row), cbuff, cm);
                }
            }
        } else {
            LibMatrixAgg.var(a.values(rl), a.pos(rl), (int) a.size(rl, ru), cbuff, cm);
        }
        c.set(0, 0, cbuff.getRequiredResult(CMOperator.AggregateOperationTypes.VARIANCE));
        c.set(0, 1, cbuff.mean._sum);
        c.set(0, 2, cbuff.w);
        c.set(0, 3, cbuff.m2._correction);
        c.set(0, 4, cbuff.mean._correction);
    }

    /**
     * Sparse row variances for rows {@code [rl, ru)}. Per row the buffer is reset,
     * {@code cbuff.w} is set to the number of implicit zeros, and (for non-empty rows) the
     * stored values are aggregated. The variance result, {@code mean._sum}, {@code w},
     * {@code m2._correction} and {@code mean._correction} go to columns 0..4 of the output row.
     *
     * @param a     sparse input block
     * @param c     output block with at least five columns
     * @param n     number of columns
     * @param cbuff central-moment buffer (reset per row)
     * @param cm    central-moment function
     * @param rl    first row (inclusive)
     * @param ru    last row (exclusive)
     */
    private static void s_uarvar(SparseBlock a, DenseBlock c, int n, CM_COV_Object cbuff, CM cm, int rl, int ru) {
        for (int row = rl; row < ru; row++) {
            cbuff.reset();
            final boolean empty = a.isEmpty(row);
            int zeros = n;
            if (!empty) {
                zeros = n - a.size(row);
            }
            cbuff.w = zeros;
            if (!empty) {
                LibMatrixAgg.var(a.values(row), a.pos(row), a.size(row), cbuff, cm);
            }
            c.set(row, 0, cbuff.getRequiredResult(CMOperator.AggregateOperationTypes.VARIANCE));
            c.set(row, 1, cbuff.mean._sum);
            c.set(row, 2, cbuff.w);
            c.set(row, 3, cbuff.m2._correction);
            c.set(row, 4, cbuff.mean._correction);
        }
    }

    /**
     * Sparse column variances over rows {@code [rl, ru)}, done in two passes that must not be
     * interleaved:
     * <ol>
     * <li>Row 2 of {@code c} (the per-column counts) is set to {@code ru - rl} and then
     *     decremented once per stored value, leaving the number of implicit zeros per
     *     column.</li>
     * <li>The stored values are folded into the per-column state with the sparse
     *     {@code varAgg}.</li>
     * </ol>
     *
     * @param a     sparse input block
     * @param c     output block holding the per-column state
     * @param n     number of columns
     * @param cbuff scratch central-moment buffer
     * @param cm    central-moment function
     * @param rl    first row (inclusive)
     * @param ru    last row (exclusive)
     */
    private static void s_uacvar(SparseBlock a, DenseBlock c, int n, CM_COV_Object cbuff, CM cm, int rl, int ru) {
        c.set(2, 3, 0, n, ru - rl);
        final double[] counts = c.values(2);
        final int countsOff = c.pos(2);
        if (a.isContiguous()) {
            LibMatrixAgg.countDisAgg(a.values(rl), counts, a.indexes(rl), a.pos(rl), countsOff, (int) a.size(rl, ru));
            LibMatrixAgg.varAgg(a.values(rl), c, a.indexes(rl), a.pos(rl), (int) a.size(rl, ru), n, cbuff, cm);
            return;
        }
        // pass 1: subtract stored values from the per-column counts
        for (int row = rl; row < ru; row++) {
            if (!a.isEmpty(row)) {
                LibMatrixAgg.countDisAgg(a.values(row), counts, a.indexes(row), a.pos(row), countsOff, a.size(row));
            }
        }
        // pass 2: aggregate stored values
        for (int row = rl; row < ru; row++) {
            if (!a.isEmpty(row)) {
                LibMatrixAgg.varAgg(a.values(row), c, a.indexes(row), a.pos(row), a.size(row), n, cbuff, cm);
            }
        }
    }

    /**
     * Sparse full product over all cells of rows {@code [rl, ru)}, written to {@code c(0, 0)}.
     *
     * <p>Fixes over the previous version:
     * <ul>
     * <li>The stored values of a row are read starting at {@code a.pos(i)} instead of offset
     *     0, which is required whenever several rows share one value array.</li>
     * <li>An empty row consists solely of implicit zeros and therefore zeroes the product
     *     (when {@code n > 0}); previously such rows were silently skipped.</li>
     * </ul>
     *
     * @param a  sparse input block
     * @param c  output block
     * @param n  number of columns
     * @param rl first row (inclusive)
     * @param ru last row (exclusive)
     */
    private static void s_uam(SparseBlock a, DenseBlock c, int n, int rl, int ru) {
        double ret = 1.0;
        for (int i = rl; i < ru; ++i) {
            if (a.isEmpty(i)) {
                if (n > 0) {
                    // multiply (rather than assign) so an earlier NaN still propagates
                    ret *= 0.0;
                }
            } else {
                int alen = a.size(i);
                ret *= LibMatrixAgg.product(a.values(i), a.pos(i), alen);
                // fewer stored values than columns means the row contains a zero
                ret *= alen < n ? 0.0 : 1.0;
            }
            // early exit once the product is zero (not reached for NaN)
            if (ret == 0.0) break;
        }
        c.set(0, 0, ret);
    }

    /**
     * Sparse row products for rows {@code [rl, ru)}: for every non-empty row the product of
     * its stored values is written to index {@code i} of {@code c.valuesAt(0)}, forced to
     * zero (via multiplication) when the row has fewer than {@code n} stored values.
     * Empty rows are not written; NOTE(review): this relies on the output already holding
     * 0 for those rows - confirm against the caller.
     *
     * <p>Fix: the stored values of a row are read starting at {@code a.pos(i)} instead of
     * offset 0, which is required whenever several rows share one value array.
     *
     * @param a  sparse input block
     * @param c  output block; only {@code c.valuesAt(0)} is written
     * @param n  number of columns
     * @param rl first row (inclusive)
     * @param ru last row (exclusive)
     */
    private static void s_uarm(SparseBlock a, DenseBlock c, int n, int rl, int ru) {
        double[] lc = c.valuesAt(0);
        for (int i = rl; i < ru; ++i) {
            if (a.isEmpty(i)) continue;
            int alen = a.size(i);
            double tmp = LibMatrixAgg.product(a.values(i), a.pos(i), alen);
            lc[i] = tmp * (double)(alen >= n ? 1 : 0);
        }
    }

    /**
     * Sparse column products over rows {@code [rl, ru)}. {@code c} is filled with 1.0, the
     * stored values of every non-empty row are multiplied into the column they belong to, and
     * finally every column with fewer stored values than rows is multiplied by 0 to account
     * for implicit zeros.
     *
     * <p>Fix: stored values are now scattered by their column index via the index-aware
     * {@code productAgg}. The previous positional vector multiply applied the k-th stored
     * value of a row to column k regardless of its actual column index.
     *
     * @param a  sparse input block
     * @param c  output block (reset to 1.0 first)
     * @param n  number of columns
     * @param rl first row (inclusive)
     * @param ru last row (exclusive)
     */
    private static void s_uacm(SparseBlock a, DenseBlock c, int n, int rl, int ru) {
        double[] lc = c.set(1.0).valuesAt(0);
        int[] cnt = new int[n];
        for (int i = rl; i < ru; ++i) {
            if (a.isEmpty(i)) continue;
            LibMatrixAgg.countAgg(a.values(i), cnt, a.indexes(i), a.pos(i), a.size(i));
            LibMatrixAgg.productAgg(a.values(i), lc, a.indexes(i), a.pos(i), 0, a.size(i));
        }
        for (int j = 0; j < n; ++j) {
            if (cnt[j] < ru - rl) {
                // multiply (rather than assign) so NaN/Inf propagate as before
                lc[j] *= 0.0;
            }
        }
    }

    /**
     * Feeds {@code len} consecutive values of {@code a}, starting at {@code ai}, into
     * {@code kbuff} through {@code kplus.execute2}.
     *
     * @param a     value array
     * @param ai    start offset in {@code a}
     * @param len   number of values
     * @param kbuff accumulation buffer
     * @param kplus summation function
     */
    private static void sum(double[] a, int ai, int len, KahanObject kbuff, KahanFunction kplus) {
        int pos = ai;
        while (pos < ai + len) {
            kplus.execute2(kbuff, a[pos]);
            pos++;
        }
    }

    /**
     * Dense column-wise summation step: adds {@code a[ai + j]} to column {@code j} for
     * {@code j} in {@code [0, len)}. Row 0 of {@code c} holds the sums and row 1 the
     * corrections; each pair is loaded into {@code kbuff}, updated and stored back.
     *
     * @param a     value array
     * @param c     per-column state (row 0: sum, row 1: correction)
     * @param ai    start offset in {@code a}
     * @param len   number of columns to update
     * @param kbuff scratch buffer
     * @param kplus summation function
     */
    private static void sumAgg(double[] a, DenseBlock c, int ai, int len, KahanObject kbuff, KahanFunction kplus) {
        final double[] sums = c.values(0);
        final double[] corrections = c.values(1);
        final int sumOff = c.pos(0);
        final int corrOff = c.pos(1);
        int col = 0;
        while (col < len) {
            kbuff._sum = sums[sumOff + col];
            kbuff._correction = corrections[corrOff + col];
            kplus.execute2(kbuff, a[ai + col]);
            sums[sumOff + col] = kbuff._sum;
            corrections[corrOff + col] = kbuff._correction;
            col++;
        }
    }

    /**
     * Sparse column-wise summation step: adds each stored value {@code a[k]} to column
     * {@code aix[k]} for {@code k} in {@code [ai, ai + len)}. Row 0 of {@code c} holds the
     * sums and row 1 the corrections; each pair is loaded into {@code kbuff}, updated and
     * stored back.
     *
     * @param a     stored values
     * @param c     per-column state (row 0: sum, row 1: correction)
     * @param aix   column index of every stored value
     * @param ai    start offset in {@code a} and {@code aix}
     * @param len   number of stored values
     * @param n     number of columns (not used here)
     * @param kbuff scratch buffer
     * @param kplus summation function
     */
    private static void sumAgg(double[] a, DenseBlock c, int[] aix, int ai, int len, int n, KahanObject kbuff, KahanFunction kplus) {
        final double[] sums = c.values(0);
        final double[] corrections = c.values(1);
        final int sumOff = c.pos(0);
        final int corrOff = c.pos(1);
        int k = ai;
        while (k < ai + len) {
            final int col = aix[k];
            kbuff._sum = sums[sumOff + col];
            kbuff._correction = corrections[corrOff + col];
            kplus.execute2(kbuff, a[k]);
            sums[sumOff + col] = kbuff._sum;
            corrections[corrOff + col] = kbuff._correction;
            k++;
        }
    }

    /**
     * Multiplies {@code len} consecutive values of {@code a} starting at {@code ai}.
     * The scan stops early as soon as the running product equals 0.0.
     *
     * @param a   value array
     * @param ai  start offset in {@code a}
     * @param len number of values
     * @return the product, or 1.0 when {@code len} is not positive
     */
    private static double product(double[] a, int ai, int len) {
        double acc = 1.0;
        for (int k = 0; k < len && acc != 0.0; k++) {
            acc *= a[ai + k];
        }
        return acc;
    }

    /**
     * Dense element-wise product step: {@code c[ci + k] *= a[ai + k]} for {@code k} in
     * {@code [0, len)}.
     *
     * @param a   input values
     * @param c   running products (updated in place)
     * @param ai  start offset in {@code a}
     * @param ci  start offset in {@code c}
     * @param len number of elements
     */
    private static void productAgg(double[] a, double[] c, int ai, int ci, int len) {
        for (int k = 0; k < len; k++) {
            c[ci + k] *= a[ai + k];
        }
    }

    /**
     * Sparse product step: {@code c[ci + aix[k]] *= a[k]} for {@code k} in
     * {@code [ai, ai + len)}.
     *
     * @param a   stored values
     * @param c   running products (updated in place)
     * @param aix column index of every stored value
     * @param ai  start offset in {@code a} and {@code aix}
     * @param ci  base offset in {@code c}
     * @param len number of stored values
     */
    private static void productAgg(double[] a, double[] c, int[] aix, int ai, int ci, int len) {
        int k = ai;
        while (k < ai + len) {
            c[ci + aix[k]] *= a[k];
            k++;
        }
    }

    /**
     * Feeds {@code len} consecutive values of {@code a}, starting at {@code ai}, into
     * {@code kbuff} through {@code mean.execute2}. The third argument passed for the k-th
     * value (0-based) is {@code count + k + 1}.
     *
     * @param a     value array
     * @param ai    start offset in {@code a}
     * @param len   number of values
     * @param count number of values already accounted for before this call
     * @param kbuff accumulation buffer
     * @param mean  incremental mean function
     */
    private static void mean(double[] a, int ai, int len, int count, KahanObject kbuff, Mean mean) {
        for (int k = 0; k < len; k++) {
            mean.execute2(kbuff, a[ai + k], count + k + 1);
        }
    }

    /**
     * Dense column-wise mean step: folds {@code a[ai + j]} into column {@code j} for
     * {@code j} in {@code [0, len)}. Rows 0, 1 and 2 of {@code c} hold the sum, count and
     * correction per column; the count is incremented by one before the update.
     *
     * @param a     value array
     * @param c     per-column state (row 0: sum, row 1: count, row 2: correction)
     * @param ai    start offset in {@code a}
     * @param len   number of columns to update
     * @param kbuff scratch buffer
     * @param mean  incremental mean function
     */
    private static void meanAgg(double[] a, DenseBlock c, int ai, int len, KahanObject kbuff, Mean mean) {
        final double[] sums = c.values(0);
        final double[] counts = c.values(1);
        final double[] corrections = c.values(2);
        final int sumOff = c.pos(0);
        final int countOff = c.pos(1);
        final int corrOff = c.pos(2);
        int col = 0;
        while (col < len) {
            kbuff._sum = sums[sumOff + col];
            final double newCount = counts[countOff + col] + 1.0;
            kbuff._correction = corrections[corrOff + col];
            mean.execute2(kbuff, a[ai + col], newCount);
            sums[sumOff + col] = kbuff._sum;
            counts[countOff + col] = newCount;
            corrections[corrOff + col] = kbuff._correction;
            col++;
        }
    }

    /**
     * Sparse column-wise mean step: folds each stored value {@code a[k]} into column
     * {@code aix[k]} for {@code k} in {@code [ai, ai + len)}. Rows 0, 1 and 2 of {@code c}
     * hold the sum, count and correction per column; the count is incremented by one before
     * the update.
     *
     * @param a     stored values
     * @param c     per-column state (row 0: sum, row 1: count, row 2: correction)
     * @param aix   column index of every stored value
     * @param ai    start offset in {@code a} and {@code aix}
     * @param len   number of stored values
     * @param n     number of columns (not used here)
     * @param kbuff scratch buffer
     * @param mean  incremental mean function
     */
    private static void meanAgg(double[] a, DenseBlock c, int[] aix, int ai, int len, int n, KahanObject kbuff, Mean mean) {
        final double[] sums = c.values(0);
        final double[] counts = c.values(1);
        final double[] corrections = c.values(2);
        final int sumOff = c.pos(0);
        final int countOff = c.pos(1);
        final int corrOff = c.pos(2);
        int k = ai;
        while (k < ai + len) {
            final int col = aix[k];
            kbuff._sum = sums[sumOff + col];
            final double newCount = counts[countOff + col] + 1.0;
            kbuff._correction = corrections[corrOff + col];
            mean.execute2(kbuff, a[k], newCount);
            sums[sumOff + col] = kbuff._sum;
            counts[countOff + col] = newCount;
            corrections[corrOff + col] = kbuff._correction;
            k++;
        }
    }

    /**
     * Feeds {@code len} consecutive values of {@code a}, starting at {@code ai}, through
     * {@code cm.execute}, carrying the returned buffer forward to the next value.
     *
     * @param a     value array
     * @param ai    start offset in {@code a}
     * @param len   number of values
     * @param cbuff central-moment buffer
     * @param cm    central-moment function
     */
    private static void var(double[] a, int ai, int len, CM_COV_Object cbuff, CM cm) {
        for (int k = 0; k < len; k++) {
            cbuff = (CM_COV_Object) cm.execute((Data) cbuff, a[ai + k]);
        }
    }

    /**
     * Dense column-wise variance step: folds {@code a[ai + j]} into column {@code j} for
     * {@code j} in {@code [0, len)}. Rows 0..4 of {@code c} hold variance, mean, count,
     * m2 correction and mean correction per column. For each column the buffer is rebuilt
     * from that state ({@code m2._sum = variance * (count - 1)}), updated via
     * {@code cm.execute}, and written back.
     *
     * @param a     value array
     * @param c     per-column state (rows 0..4 as described above)
     * @param ai    start offset in {@code a}
     * @param len   number of columns to update
     * @param cbuff scratch central-moment buffer
     * @param cm    central-moment function
     */
    private static void varAgg(double[] a, DenseBlock c, int ai, int len, CM_COV_Object cbuff, CM cm) {
        final double[] variance = c.values(0);
        final double[] means = c.values(1);
        final double[] counts = c.values(2);
        final double[] m2Corr = c.values(3);
        final double[] meanCorr = c.values(4);
        final int varOff = c.pos(0);
        final int meanOff = c.pos(1);
        final int countOff = c.pos(2);
        final int m2CorrOff = c.pos(3);
        final int meanCorrOff = c.pos(4);
        int col = 0;
        while (col < len) {
            // restore the buffer from the stored per-column state
            cbuff.w = counts[countOff + col];
            cbuff.m2._sum = variance[varOff + col] * (cbuff.w - 1.0);
            cbuff.mean._sum = means[meanOff + col];
            cbuff.m2._correction = m2Corr[m2CorrOff + col];
            cbuff.mean._correction = meanCorr[meanCorrOff + col];
            cbuff = (CM_COV_Object) cm.execute((Data) cbuff, a[ai + col]);
            // persist the updated state
            variance[varOff + col] = cbuff.getRequiredResult(CMOperator.AggregateOperationTypes.VARIANCE);
            means[meanOff + col] = cbuff.mean._sum;
            counts[countOff + col] = cbuff.w;
            m2Corr[m2CorrOff + col] = cbuff.m2._correction;
            meanCorr[meanCorrOff + col] = cbuff.mean._correction;
            col++;
        }
    }

    /**
     * Sparse column-wise variance step: folds each stored value {@code a[k]} into column
     * {@code aix[k]} for {@code k} in {@code [ai, ai + len)}. Rows 0..4 of {@code c} hold
     * variance, mean, count, m2 correction and mean correction per column. For each value
     * the buffer is rebuilt from that state ({@code m2._sum = variance * (count - 1)}),
     * updated via {@code cm.execute}, and written back.
     *
     * @param a     stored values
     * @param c     per-column state (rows 0..4 as described above)
     * @param aix   column index of every stored value
     * @param ai    start offset in {@code a} and {@code aix}
     * @param len   number of stored values
     * @param n     number of columns (not used here)
     * @param cbuff scratch central-moment buffer
     * @param cm    central-moment function
     */
    private static void varAgg(double[] a, DenseBlock c, int[] aix, int ai, int len, int n, CM_COV_Object cbuff, CM cm) {
        final double[] variance = c.values(0);
        final double[] means = c.values(1);
        final double[] counts = c.values(2);
        final double[] m2Corr = c.values(3);
        final double[] meanCorr = c.values(4);
        final int varOff = c.pos(0);
        final int meanOff = c.pos(1);
        final int countOff = c.pos(2);
        final int m2CorrOff = c.pos(3);
        final int meanCorrOff = c.pos(4);
        int k = ai;
        while (k < ai + len) {
            final int col = aix[k];
            // restore the buffer from the stored per-column state
            cbuff.w = counts[countOff + col];
            cbuff.m2._sum = variance[varOff + col] * (cbuff.w - 1.0);
            cbuff.mean._sum = means[meanOff + col];
            cbuff.m2._correction = m2Corr[m2CorrOff + col];
            cbuff.mean._correction = meanCorr[meanCorrOff + col];
            cbuff = (CM_COV_Object) cm.execute((Data) cbuff, a[k]);
            // persist the updated state
            variance[varOff + col] = cbuff.getRequiredResult(CMOperator.AggregateOperationTypes.VARIANCE);
            means[meanOff + col] = cbuff.mean._sum;
            counts[countOff + col] = cbuff.w;
            m2Corr[m2CorrOff + col] = cbuff.m2._correction;
            meanCorr[meanCorrOff + col] = cbuff.mean._correction;
            k++;
        }
    }

    /**
     * Folds {@code len} consecutive values of {@code a}, starting at {@code ai}, with
     * {@code aggop.execute}, beginning from {@code init}.
     *
     * @param a     value array
     * @param ai    start offset in {@code a}
     * @param init  initial aggregate value
     * @param len   number of values
     * @param aggop aggregation function applied pairwise
     * @return the final aggregate ({@code init} when {@code len} is not positive)
     */
    private static double builtin(double[] a, int ai, double init, int len, Builtin aggop) {
        double acc = init;
        for (int k = 0; k < len; k++) {
            acc = aggop.execute(acc, a[ai + k]);
        }
        return acc;
    }

    /**
     * Dense element-wise aggregation step: {@code c[j] = aggop.execute(c[j], a[ai + j])} for
     * {@code j} in {@code [0, len)}.
     *
     * @param a     input values
     * @param c     running aggregates (updated in place)
     * @param ai    start offset in {@code a}
     * @param len   number of elements
     * @param aggop aggregation function applied pairwise
     */
    private static void builtinAgg(double[] a, double[] c, int ai, int len, Builtin aggop) {
        int col = 0;
        while (col < len) {
            c[col] = aggop.execute(c[col], a[ai + col]);
            col++;
        }
    }

    /**
     * Sparse aggregation step: {@code c[aix[k]] = aggop.execute(c[aix[k]], a[k])} for
     * {@code k} in {@code [ai, ai + len)}.
     *
     * @param a     stored values
     * @param c     running aggregates (updated in place)
     * @param aix   column index of every stored value
     * @param ai    start offset in {@code a} and {@code aix}
     * @param len   number of stored values
     * @param aggop aggregation function applied pairwise
     */
    private static void builtinAgg(double[] a, double[] c, int[] aix, int ai, int len, Builtin aggop) {
        int k = ai;
        while (k < ai + len) {
            c[aix[k]] = aggop.execute(c[aix[k]], a[k]);
            k++;
        }
    }

    /**
     * Returns the position (relative to {@code ai}) of the last value in
     * {@code a[ai .. ai + len)} that is {@code >=} the running maximum, which starts at
     * {@code init}. If no value qualifies (for example all are NaN or below {@code init}),
     * 0 is returned.
     *
     * @param a     value array
     * @param ai    start offset in {@code a}
     * @param init  initial maximum
     * @param len   number of values
     * @param aggop not used by this implementation
     * @return 0-based relative index of the maximum, never negative
     */
    private static int indexmax(double[] a, int ai, double init, int len, Builtin aggop) {
        double best = init;
        int bestPos = -1;
        final int end = ai + len;
        for (int k = ai; k < end; k++) {
            // ">=" makes later ties win
            if (a[k] >= best) {
                bestPos = k - ai;
                best = a[k];
            }
        }
        return bestPos < 0 ? 0 : bestPos;
    }

    /**
     * Returns the position (relative to {@code ai}) of the last value in
     * {@code a[ai .. ai + len)} that is {@code <=} the running minimum, which starts at
     * {@code init}. If no value qualifies (for example all are NaN or above {@code init}),
     * 0 is returned.
     *
     * @param a     value array
     * @param ai    start offset in {@code a}
     * @param init  initial minimum
     * @param len   number of values
     * @param aggop not used by this implementation
     * @return 0-based relative index of the minimum, never negative
     */
    private static int indexmin(double[] a, int ai, double init, int len, Builtin aggop) {
        double best = init;
        int bestPos = -1;
        final int end = ai + len;
        for (int k = ai; k < end; k++) {
            // "<=" makes later ties win
            if (a[k] <= best) {
                bestPos = k - ai;
                best = a[k];
            }
        }
        return bestPos < 0 ? 0 : bestPos;
    }

    /**
     * Sparse count step: increments {@code c[aix[k]]} once for every {@code k} in
     * {@code [ai, ai + len)}. The loop is manually unrolled by 8, with the
     * {@code len % 8} leftover entries handled first.
     *
     * @param a   stored values (not read; only the indexes matter)
     * @param c   per-column counters (updated in place)
     * @param aix column index of every stored value
     * @param ai  start offset in {@code aix}
     * @param len number of stored values
     */
    public static void countAgg(double[] a, int[] c, int[] aix, int ai, int len) {
        final int head = ai + len % 8;
        final int end = ai + len;
        int k = ai;
        // leftover entries that do not fill a full group of 8
        while (k < head) {
            c[aix[k]]++;
            k++;
        }
        // remaining entries in groups of 8
        while (k < end) {
            c[aix[k]]++;
            c[aix[k + 1]]++;
            c[aix[k + 2]]++;
            c[aix[k + 3]]++;
            c[aix[k + 4]]++;
            c[aix[k + 5]]++;
            c[aix[k + 6]]++;
            c[aix[k + 7]]++;
            k += 8;
        }
    }

    /**
     * Dense count step: increments {@code c[j]} for every {@code j} in {@code [0, len)} whose
     * value {@code a[ai + j]} is not equal to 0.0. The loop is manually unrolled by 8, with
     * the {@code len % 8} leftover entries handled first.
     *
     * @param a   input values
     * @param c   per-column counters (updated in place)
     * @param ai  start offset in {@code a}
     * @param len number of columns
     */
    public static void countAgg(double[] a, int[] c, int ai, int len) {
        final int head = len % 8;
        int j = 0;
        // leftover entries that do not fill a full group of 8
        while (j < head) {
            c[j] += a[ai + j] != 0.0 ? 1 : 0;
            j++;
        }
        // remaining entries in groups of 8
        while (j < len) {
            final int base = ai + j;
            c[j] += a[base] != 0.0 ? 1 : 0;
            c[j + 1] += a[base + 1] != 0.0 ? 1 : 0;
            c[j + 2] += a[base + 2] != 0.0 ? 1 : 0;
            c[j + 3] += a[base + 3] != 0.0 ? 1 : 0;
            c[j + 4] += a[base + 4] != 0.0 ? 1 : 0;
            c[j + 5] += a[base + 5] != 0.0 ? 1 : 0;
            c[j + 6] += a[base + 6] != 0.0 ? 1 : 0;
            c[j + 7] += a[base + 7] != 0.0 ? 1 : 0;
            j += 8;
        }
    }

    /**
     * Sparse count-down step: decrements {@code c[ci + aix[k]]} by 1.0 once for every
     * {@code k} in {@code [ai, ai + len)}. The loop is manually unrolled by 8, with the
     * {@code len % 8} leftover entries handled first.
     *
     * @param a   stored values (not read; only the indexes matter)
     * @param c   per-column counters (updated in place)
     * @param aix column index of every stored value
     * @param ai  start offset in {@code aix}
     * @param ci  base offset in {@code c}
     * @param len number of stored values
     */
    private static void countDisAgg(double[] a, double[] c, int[] aix, int ai, int ci, int len) {
        final int head = ai + len % 8;
        final int end = ai + len;
        int k = ai;
        // leftover entries that do not fill a full group of 8
        while (k < head) {
            c[ci + aix[k]] -= 1.0;
            k++;
        }
        // remaining entries in groups of 8
        while (k < end) {
            c[ci + aix[k]] -= 1.0;
            c[ci + aix[k + 1]] -= 1.0;
            c[ci + aix[k + 2]] -= 1.0;
            c[ci + aix[k + 3]] -= 1.0;
            c[ci + aix[k + 4]] -= 1.0;
            c[ci + aix[k + 5]] -= 1.0;
            c[ci + aix[k + 6]] -= 1.0;
            c[ci + aix[k + 7]] -= 1.0;
            k += 8;
        }
    }

    /**
     * Task that runs one grouped aggregation over the column range {@code [cl, cu)}.
     * Depending on the runtime type of the operator it dispatches to
     * {@code groupedAggregateCM} or {@code groupedAggregateKahanPlus}; any other operator
     * type is ignored.
     */
    private static class GrpAggTask
    extends AggTask {
        private MatrixBlock _groups;
        private MatrixBlock _target;
        private MatrixBlock _weights;
        private MatrixBlock _ret;
        private int _numGroups;
        private Operator _op;
        private int _cl;
        private int _cu;

        /**
         * @param groups    group assignment block
         * @param target    block holding the values to aggregate
         * @param weights   weights block as forwarded to the aggregation routine
         * @param ret       result block
         * @param numGroups number of groups
         * @param op        aggregation operator
         * @param cl        first column (inclusive)
         * @param cu        last column (exclusive)
         */
        protected GrpAggTask(MatrixBlock groups, MatrixBlock target, MatrixBlock weights, MatrixBlock ret, int numGroups, Operator op, int cl, int cu) {
            _groups = groups;
            _target = target;
            _weights = weights;
            _ret = ret;
            _numGroups = numGroups;
            _op = op;
            _cl = cl;
            _cu = cu;
        }

        /**
         * Executes the aggregation that matches the operator type.
         *
         * @return always {@code null}; results are written into the result block
         */
        @Override
        public Object call() {
            if (_op instanceof CMOperator) {
                LibMatrixAgg.groupedAggregateCM(_groups, _target, _weights, _ret, _numGroups, (CMOperator) _op, _cl, _cu);
                return null;
            }
            if (_op instanceof AggregateOperator) {
                LibMatrixAgg.groupedAggregateKahanPlus(_groups, _target, _weights, _ret, _numGroups, (AggregateOperator) _op, _cl, _cu);
            }
            return null;
        }
    }

    /**
     * Task that runs a ternary aggregate for the bounds {@code rl}/{@code ru}
     * it was given and returns the block holding its result.
     *
     * <p>The block passed as {@code ret} is only read for its {@code rlen}
     * and {@code clen}; {@link #call()} writes into a newly created block.
     */
    private static class AggTernaryTask
    implements Callable<MatrixBlock> {
        private final MatrixBlock _in1;
        private final MatrixBlock _in2;
        private final MatrixBlock _in3;
        private MatrixBlock _ret;
        private final IndexFunction _ixFn;
        private final int _rl;
        private final int _ru;

        /**
         * Stores all arguments for the later {@link #call()}; nothing is
         * computed here.
         */
        protected AggTernaryTask(MatrixBlock in1, MatrixBlock in2, MatrixBlock in3, MatrixBlock ret, IndexFunction ixFn, int rl, int ru) {
            _in1 = in1;
            _in2 = in2;
            _in3 = in3;
            _ret = ret;
            _ixFn = ixFn;
            _rl = rl;
            _ru = ru;
        }

        /**
         * Creates a new output block from the current block's {@code rlen}
         * and {@code clen}, allocates its dense storage, runs the aggregate
         * into it and recomputes its non-zero count.
         *
         * @return the newly created output block
         */
        @Override
        public MatrixBlock call() {
            _ret = new MatrixBlock(_ret.rlen, _ret.clen, false);
            _ret.allocateDenseBlock();

            // The third input may be absent, hence the null guard.
            final boolean anySparse = _in1.sparse || _in2.sparse || (_in3 != null && _in3.sparse);
            if (anySparse) {
                LibMatrixAgg.aggregateTernaryGeneric(_in1, _in2, _in3, _ret, _ixFn, _rl, _ru);
            } else {
                LibMatrixAgg.aggregateTernaryDense(_in1, _in2, _in3, _ret, _ixFn, _rl, _ru);
            }

            _ret.recomputeNonZeros();
            return _ret;
        }
    }

    /**
     * Task that runs a cumulative unary aggregate for the bounds
     * {@code rl}/{@code ru} it was given, writing into the output block
     * passed at construction.
     */
    private static class CumAggTask
    implements Callable<Long> {
        private final MatrixBlock _in;
        private final double[] _agg;
        private final MatrixBlock _ret;
        private final AggType _aggtype;
        private final UnaryOperator _uop;
        private final int _rl;
        private final int _ru;

        /**
         * Stores all arguments for the later {@link #call()}; nothing is
         * computed here.
         */
        protected CumAggTask(MatrixBlock in, double[] agg, MatrixBlock ret, AggType aggtype, UnaryOperator uop, int rl, int ru) {
            _in = in;
            _agg = agg;
            _ret = ret;
            _aggtype = aggtype;
            _uop = uop;
            _rl = rl;
            _ru = ru;
        }

        /**
         * Delegates to the sparse or dense cumulative-aggregate routine,
         * depending on the input's {@code sparse} flag.
         *
         * @return the value of {@code recomputeNonZeros} on the output block
         *         for the arguments {@code (rl, ru - 1, 0, numColumns - 1)}
         */
        @Override
        public Long call() {
            if (_in.sparse) {
                LibMatrixAgg.cumaggregateUnaryMatrixSparse(_in, _ret, _aggtype, _uop.fn, _agg, _rl, _ru);
            } else {
                LibMatrixAgg.cumaggregateUnaryMatrixDense(_in, _ret, _aggtype, _uop.fn, _agg, _rl, _ru);
            }
            return _ret.recomputeNonZeros(_rl, _ru - 1, 0, _ret.getNumColumns() - 1);
        }
    }

    /**
     * Task that runs a unary aggregate for the bounds {@code rl}/{@code ru}
     * it was given into an output block of its own, which is available
     * through {@link #getResult()} afterwards.
     *
     * <p>The block passed as {@code ret} is only read for its {@code rlen}
     * and {@code clen}; {@link #call()} writes into a newly created block.
     */
    private static class PartialAggTask
    extends AggTask {
        private final MatrixBlock _in;
        private MatrixBlock _ret;
        private final AggType _aggtype;
        private final AggregateUnaryOperator _uaop;
        private final int _rl;
        private final int _ru;

        /**
         * Stores all arguments for the later {@link #call()}; nothing is
         * computed here.
         */
        protected PartialAggTask(MatrixBlock in, MatrixBlock ret, AggType aggtype, AggregateUnaryOperator uaop, int rl, int ru) {
            _in = in;
            _ret = ret;
            _aggtype = aggtype;
            _uaop = uaop;
            _rl = rl;
            _ru = ru;
        }

        /**
         * Creates a new output block from the current block's {@code rlen}
         * and {@code clen}, allocates its dense storage, runs the sparse or
         * dense aggregate into it and recomputes its non-zero count.
         *
         * @return always {@code null}; use {@link #getResult()} for the output
         */
        @Override
        public Object call() {
            _ret = new MatrixBlock(_ret.rlen, _ret.clen, false);
            _ret.allocateDenseBlock();

            if (_in.sparse) {
                LibMatrixAgg.aggregateUnaryMatrixSparse(_in, _ret, _aggtype, _uaop.aggOp.increOp.fn, _uaop.indexFn, _rl, _ru);
            } else {
                LibMatrixAgg.aggregateUnaryMatrixDense(_in, _ret, _aggtype, _uaop.aggOp.increOp.fn, _uaop.indexFn, _rl, _ru);
            }

            _ret.recomputeNonZeros();
            return null;
        }

        /**
         * @return the current output block: the one created by
         *         {@link #call()}, or the block passed to the constructor if
         *         {@link #call()} has not replaced it yet
         */
        public MatrixBlock getResult() {
            return _ret;
        }
    }

    /**
     * Task that runs a unary aggregate for the bounds {@code rl}/{@code ru}
     * it was given, writing directly into the output block passed at
     * construction.
     */
    private static class RowAggTask
    extends AggTask {
        private final MatrixBlock _in;
        private final MatrixBlock _ret;
        private final AggType _aggtype;
        private final AggregateUnaryOperator _uaop;
        private final int _rl;
        private final int _ru;

        /**
         * Stores all arguments for the later {@link #call()}; nothing is
         * computed here.
         */
        protected RowAggTask(MatrixBlock in, MatrixBlock ret, AggType aggtype, AggregateUnaryOperator uaop, int rl, int ru) {
            _in = in;
            _ret = ret;
            _aggtype = aggtype;
            _uaop = uaop;
            _rl = rl;
            _ru = ru;
        }

        /**
         * Delegates to the sparse or dense unary-aggregate routine, depending
         * on the input's {@code sparse} flag.
         *
         * @return always {@code null}
         */
        @Override
        public Object call() {
            if (_in.sparse) {
                LibMatrixAgg.aggregateUnaryMatrixSparse(_in, _ret, _aggtype, _uaop.aggOp.increOp.fn, _uaop.indexFn, _rl, _ru);
            } else {
                LibMatrixAgg.aggregateUnaryMatrixDense(_in, _ret, _aggtype, _uaop.aggOp.increOp.fn, _uaop.indexFn, _rl, _ru);
            }
            return null;
        }
    }

    /**
     * Common supertype of the aggregation tasks in this file that are typed
     * as {@code Callable<Object>}. Declares no state and no behavior of its
     * own.
     */
    private abstract static class AggTask
    implements Callable<Object> {
        /** Private, so only code inside the enclosing top-level class can subclass. */
        private AggTask() {
        }
    }

    private static enum AggType {
        KAHAN_SUM,
        KAHAN_SUM_SQ,
        CUM_KAHAN_SUM,
        CUM_MIN,
        CUM_MAX,
        CUM_PROD,
        CUM_SUM_PROD,
        MIN,
        MAX,
        MEAN,
        VAR,
        MAX_INDEX,
        MIN_INDEX,
        PROD,
        INVALID;

    }
}

