/*
 * Decompiled with CFR 0.152.
 */
package org.apache.sysds.runtime.matrix.data;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.stream.IntStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.math3.util.FastMath;
import org.apache.sysds.hops.OptimizerUtils;
import org.apache.sysds.lops.MapMultChain;
import org.apache.sysds.lops.WeightedCrossEntropy;
import org.apache.sysds.lops.WeightedDivMM;
import org.apache.sysds.lops.WeightedSigmoid;
import org.apache.sysds.lops.WeightedSquaredLoss;
import org.apache.sysds.lops.WeightedUnaryMM;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysds.runtime.data.DenseBlock;
import org.apache.sysds.runtime.data.DenseBlockFactory;
import org.apache.sysds.runtime.data.SparseBlock;
import org.apache.sysds.runtime.data.SparseBlockCSR;
import org.apache.sysds.runtime.data.SparseBlockFactory;
import org.apache.sysds.runtime.data.SparseBlockMCSR;
import org.apache.sysds.runtime.data.SparseRowScalar;
import org.apache.sysds.runtime.functionobjects.SwapIndex;
import org.apache.sysds.runtime.functionobjects.ValueFunction;
import org.apache.sysds.runtime.matrix.data.LibMatrixNative;
import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.operators.ReorgOperator;
import org.apache.sysds.runtime.util.CommonThreadPool;
import org.apache.sysds.runtime.util.UtilFunctions;
import org.apache.sysds.utils.NativeHelper;

public class LibMatrixMult {
    /** Switch for low-level optimizations. NOTE(review): its consumers are outside the reviewed excerpt. */
    private static final boolean LOW_LEVEL_OPTIMIZATION = true;
    /** Memory-overhead threshold, 0x200000 = 2,097,152 (presumably a cell or byte count — TODO confirm at the use site). */
    private static final long MEM_OVERHEAD_THRESHOLD = 0x200000L;
    /** First minimum-FLOP threshold for parallelization, 0x200000 = 2,097,152 (use site not visible here). */
    private static final long PAR_MINFLOP_THRESHOLD1 = 0x200000L;
    /** Second minimum-FLOP threshold for parallelization, 131,072 (use site not visible here). */
    private static final long PAR_MINFLOP_THRESHOLD2 = 131072L;
    /** Assumed L2 cache size, 262,144 = 256 * 1024 (presumably bytes — TODO confirm). */
    public static final int L2_CACHESIZE = 262144;
    /** Assumed L3 cache size, 0x1000000 = 16,777,216 = 16 * 1024 * 1024 (presumably bytes — TODO confirm). */
    public static final int L3_CACHESIZE = 0x1000000;
    /** Logger registered under this class's fully qualified name. */
    private static final Log LOG = LogFactory.getLog((String)LibMatrixMult.class.getName());

    /** Private constructor: prevents instantiation; all operations visible in this file are static. */
    private LibMatrixMult() {
    }

    /**
     * Single-threaded matrix multiplication of {@code m1} and {@code m2} into {@code ret}.
     * Delegates to the row-range overload with the bounds {@code 0} and {@code m1.rlen}.
     *
     * @param m1  left-hand-side matrix
     * @param m2  right-hand-side matrix
     * @param ret output matrix, modified in place
     */
    public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret) {
        final int rowUpper = m1.rlen;
        matrixMult(m1, m2, ret, 0, rowUpper);
    }

    /**
     * Single-threaded matrix multiplication of {@code m1} and {@code m2} into {@code ret}
     * with caller control over whether the output representation is fixed.
     * Delegates to the full overload with the bounds {@code 0} and {@code m1.rlen}.
     *
     * @param m1       left-hand-side matrix
     * @param m2       right-hand-side matrix
     * @param ret      output matrix, modified in place
     * @param fixedRet forwarded unchanged to the full overload
     */
    public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean fixedRet) {
        final int rowUpper = m1.rlen;
        matrixMult(m1, m2, ret, 0, rowUpper, fixedRet);
    }

    /**
     * Single-threaded matrix multiplication for the given row bounds.
     * Delegates to the full overload with {@code fixedRet = false}.
     *
     * @param m1  left-hand-side matrix
     * @param m2  right-hand-side matrix
     * @param ret output matrix, modified in place
     * @param rl  row lower bound, forwarded unchanged
     * @param ru  row upper bound, forwarded unchanged
     */
    public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru) {
        final boolean fixedRet = false;
        matrixMult(m1, m2, ret, rl, ru, fixedRet);
    }

    /**
     * Core single-threaded matrix multiplication {@code ret = m1 %*% m2}.
     *
     * <p>Steps: (1) return early if either input is empty, (2) choose the output
     * representation (ultra-sparse, sparse or dense), (3) prepare the right-hand side,
     * (4) allocate the output, (5) dispatch to the kernel that matches the input
     * representations, and (6) unless {@code fixedRet} is set, refresh the non-zero
     * count of dense outputs and re-examine the output sparsity.
     *
     * <p>NOTE(review): {@code rl} is not forwarded; every kernel is invoked with a
     * lower bound of 0.
     *
     * @param m1       left-hand-side matrix
     * @param m2       right-hand-side matrix
     * @param ret      output matrix, modified in place
     * @param rl       row lower bound (currently unused, see note)
     * @param ru       row upper bound; replaced by {@code m2.rlen} when the
     *                 right-input-rows strategy applies and {@code ru == m1.rlen}
     * @param fixedRet if true, the current {@code ret.sparse} flag decides the kernel
     *                 and the output is not post-processed
     */
    public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru, boolean fixedRet) {
        // an empty operand yields an empty product
        if (m1.isEmptyBlock(false) || m2.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }

        // decide on the output representation
        final boolean leftIsPermutation = m1.isSparsePermutationMatrix();
        final boolean ultraSparse;
        if (fixedRet) {
            ultraSparse = ret.sparse;
        } else {
            ultraSparse = isUltraSparseMatrixMult(m1, m2, leftIsPermutation);
        }
        final boolean sparseOut = !leftIsPermutation && !ultraSparse && !fixedRet && isSparseOutputMatrixMult(m1, m2);

        // prepare the right-hand side and allocate the output
        final boolean tm2 = checkPrepMatrixMultRightInput(m1, m2);
        final MatrixBlock rhs = prepMatrixMultRightInput(m1, m2);
        ret.sparse = ultraSparse || sparseOut;
        ret.allocateBlock();

        // upper bound switches to the rows of the right-hand side for the pm2 strategy
        final boolean pm2 = !ultraSparse && checkParMatrixMultRightInputRows(m1, rhs, Integer.MAX_VALUE);
        final int upper = (pm2 && ru == m1.rlen) ? rhs.rlen : ru;
        final int colUpper = rhs.clen;

        // kernel dispatch by input representation
        if (ultraSparse) {
            matrixMultUltraSparse(m1, rhs, ret, leftIsPermutation, 0, upper);
        } else if (m1.sparse) {
            if (rhs.sparse) {
                matrixMultSparseSparse(m1, rhs, ret, pm2, sparseOut, 0, upper);
            } else {
                matrixMultSparseDense(m1, rhs, ret, pm2, 0, upper);
            }
        } else if (rhs.sparse) {
            matrixMultDenseSparse(m1, rhs, ret, pm2, 0, upper);
        } else {
            matrixMultDenseDense(m1, rhs, ret, tm2, pm2, 0, upper, 0, colUpper);
        }

        // post-processing is skipped when the caller fixed the output
        if (fixedRet) {
            return;
        }
        if (!ret.sparse) {
            ret.recomputeNonZeros();
        }
        ret.examSparsity();
    }

    /**
     * Multi-threaded matrix multiplication {@code ret = m1 %*% m2} with up to
     * {@code k} threads.
     *
     * <p>Falls back to the single-threaded overload when the multi-threading
     * constraints are not met or when the allocated output is not thread-safe.
     * Otherwise the work is split into balanced blocks over the rows of {@code m2}
     * ({@code pm2r}), the columns of {@code m2} ({@code pm2c}) or the rows of
     * {@code m1}, and one task is submitted per block.
     *
     * <p>With {@code pm2r} every task returns a partial dense result that is added
     * into {@code ret}; otherwise every task returns its non-zero count.
     *
     * @param m1  left-hand-side matrix
     * @param m2  right-hand-side matrix
     * @param ret output matrix, modified in place
     * @param k   degree of parallelism
     * @throws DMLRuntimeException wrapping any failure of the pool or of a task
     */
    public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k) {
        if (m1.isEmptyBlock(false) || m2.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }
        if (!LibMatrixMult.satisfiesMultiThreadingConstraints(m1, m2, m1.rlen == 1, true, 2L, k)) {
            LibMatrixMult.matrixMult(m1, m2, ret);
            return;
        }
        boolean m1Perm = m1.isSparsePermutationMatrix();
        boolean ultraSparse = LibMatrixMult.isUltraSparseMatrixMult(m1, m2, m1Perm);
        boolean sparse = !ultraSparse && !m1Perm && LibMatrixMult.isSparseOutputMatrixMult(m1, m2);
        boolean tm2 = LibMatrixMult.checkPrepMatrixMultRightInput(m1, m2);
        // keep the caller's m2 untouched so the sequential fallback below can use it
        final MatrixBlock m2p = LibMatrixMult.prepMatrixMultRightInput(m1, m2);
        ret.sparse = ultraSparse | sparse;
        ret.allocateBlock();
        if (!ret.isThreadSafe()) {
            // The sequential path prepares the right-hand side itself, so it must get
            // the original m2; passing m2p would apply the preparation twice.
            LibMatrixMult.matrixMult(m1, m2, ret);
            return;
        }
        boolean pm2r = !ultraSparse && !sparse && LibMatrixMult.checkParMatrixMultRightInputRows(m1, m2p, k);
        boolean pm2c = !ultraSparse && LibMatrixMult.checkParMatrixMultRightInputCols(m1, m2p, k, pm2r);
        int num = pm2r ? m2p.rlen : (pm2c ? m2p.clen : m1.rlen);
        try {
            ExecutorService pool = CommonThreadPool.get(k);
            try {
                List<MatrixMultTask> tasks = new ArrayList<>();
                List<Integer> blklens = UtilFunctions.getBalancedBlockSizesDefault(num, k, pm2r || pm2c);
                int lb = 0;
                for (int blklen : blklens) {
                    tasks.add(new MatrixMultTask(m1, m2p, ret, tm2, pm2r, pm2c, m1Perm, sparse, lb, lb + blklen));
                    lb += blklen;
                }
                // invokeAll blocks until every task has completed
                List<? extends Future<?>> taskret = pool.invokeAll(tasks);
                ret.nonZeros = 0L;
                for (Future<?> task : taskret) {
                    if (pm2r) {
                        // aggregate the partial dense results of the row-partitioned rhs
                        LibMatrixMult.vectAdd((double[]) task.get(), ret.getDenseBlockValues(), 0, 0, ret.rlen * ret.clen);
                    } else {
                        ret.nonZeros += ((Long) task.get()).longValue();
                    }
                }
                if (pm2r) {
                    ret.recomputeNonZeros();
                }
            }
            finally {
                // release the pool on the failure paths as well
                pool.shutdown();
            }
        }
        catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        }
        ret.examSparsity();
    }

    /**
     * Single-threaded matrix multiplication chain over {@code mX}, {@code mV} and the
     * optional {@code mW}; the exact formula is selected by {@code ct} and implemented
     * in the sparse or dense chain kernel.
     *
     * <p>Returns early (after re-examining the output sparsity) if {@code mX} is empty,
     * if {@code mV} is empty and the chain type is not {@code XtXvy}, or if a non-null
     * {@code mW} is empty. The output is always allocated dense.
     *
     * @param mX  matrix X
     * @param mV  matrix/vector v
     * @param mW  optional third operand, may be null
     * @param ret output matrix, modified in place
     * @param ct  chain type
     */
    public static void matrixMultChain(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, MapMultChain.ChainType ct) {
        final boolean nothingToDo = mX.isEmptyBlock(false)
            || (mV.isEmptyBlock(false) && ct != MapMultChain.ChainType.XtXvy)
            || (mW != null && mW.isEmptyBlock(false));
        if (nothingToDo) {
            ret.examSparsity();
            return;
        }

        // dense output
        ret.sparse = false;
        ret.allocateDenseBlock();

        final int rowUpper = mX.rlen;
        if (!mX.sparse) {
            matrixMultChainDense(mX, mV, mW, ret, ct, 0, rowUpper);
        } else {
            matrixMultChainSparse(mX, mV, mW, ret, ct, 0, rowUpper);
        }

        ret.recomputeNonZeros();
        ret.examSparsity();
    }

    /**
     * Multi-threaded matrix multiplication chain with up to {@code k} threads.
     *
     * <p>Uses the same early exits as the single-threaded overload and falls back to
     * it when the multi-threading constraints are not met. Otherwise the rows of
     * {@code mX} are split into balanced blocks; every task returns a partial result
     * array, and all partial results are summed into the first {@code mX.clen}
     * entries of the dense output.
     *
     * @param mX  matrix X
     * @param mV  matrix/vector v
     * @param mW  optional third operand, may be null
     * @param ret output matrix, modified in place
     * @param ct  chain type
     * @param k   degree of parallelism
     * @throws DMLRuntimeException wrapping any failure of the pool or of a task
     */
    public static void matrixMultChain(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, MapMultChain.ChainType ct, int k) {
        if (mX.isEmptyBlock(false) || mV.isEmptyBlock(false) && ct != MapMultChain.ChainType.XtXvy || mW != null && mW.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }
        if (!LibMatrixMult.satisfiesMultiThreadingConstraints(mX, true, true, mX.sparse ? 2L : 4L, k)) {
            LibMatrixMult.matrixMultChain(mX, mV, mW, ret, ct);
            return;
        }
        ret.sparse = false;
        ret.allocateDenseBlock();
        try {
            ExecutorService pool = CommonThreadPool.get(k);
            try {
                List<Integer> blklens = UtilFunctions.getBalancedBlockSizesDefault(mX.rlen, k, true);
                List<MatrixMultChainTask> tasks = new ArrayList<>();
                int lb = 0;
                for (int blklen : blklens) {
                    tasks.add(new MatrixMultChainTask(mX, mV, mW, ct, lb, lb + blklen));
                    lb += blklen;
                }
                // invokeAll blocks until every task has completed
                List<? extends Future<?>> taskret = pool.invokeAll(tasks);
                // collect the partial results and aggregate them in a single pass
                double[][] a = new double[taskret.size()][];
                for (int i = 0; i < taskret.size(); ++i) {
                    a[i] = (double[]) taskret.get(i).get();
                }
                LibMatrixMult.vectAddAll(a, ret.getDenseBlockValues(), 0, 0, mX.clen);
            }
            finally {
                // release the pool on the failure paths as well
                pool.shutdown();
            }
        }
        catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        }
        ret.recomputeNonZeros();
        ret.examSparsity();
    }

    /**
     * Single-threaded transpose-self matrix multiplication that also copies the
     * upper triangle into the lower triangle; delegates to the four-argument
     * overload with {@code copyToLowerTriangle = true}.
     *
     * @param m1            input matrix
     * @param ret           output matrix, modified in place
     * @param leftTranspose forwarded unchanged to the kernels
     */
    public static void matrixMultTransposeSelf(MatrixBlock m1, MatrixBlock ret, boolean leftTranspose) {
        final boolean copyToLowerTriangle = true;
        matrixMultTransposeSelf(m1, ret, leftTranspose, copyToLowerTriangle);
    }

    /**
     * Single-threaded transpose-self matrix multiplication into a dense output.
     *
     * <p>The sparse or dense kernel is invoked for the output rows {@code 0} to
     * {@code ret.rlen}. Only when {@code copyToLowerTriangle} is set is the upper
     * triangle mirrored, the non-zero count updated and the sparsity re-examined;
     * otherwise the output metadata is left as the kernel produced it.
     *
     * @param m1                  input matrix
     * @param ret                 output matrix, modified in place
     * @param leftTranspose       forwarded unchanged to the kernels
     * @param copyToLowerTriangle whether to mirror the upper triangle and finalize the output
     */
    public static void matrixMultTransposeSelf(MatrixBlock m1, MatrixBlock ret, boolean leftTranspose, boolean copyToLowerTriangle) {
        if (m1.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }

        // dense output
        ret.sparse = false;
        ret.allocateDenseBlock();

        final int rowUpper = ret.rlen;
        if (!m1.sparse) {
            matrixMultTransposeSelfDense(m1, ret, leftTranspose, 0, rowUpper);
        } else {
            matrixMultTransposeSelfSparse(m1, ret, leftTranspose, 0, rowUpper);
        }

        if (!copyToLowerTriangle) {
            return;
        }
        ret.setNonZeros(copyUpperToLowerTriangle(ret));
        ret.examSparsity();
    }

    /**
     * Multi-threaded transpose-self matrix multiplication with up to {@code k} threads.
     *
     * <p>Falls back to the single-threaded overload when the TSMM multi-threading
     * constraints are not met. Otherwise the output rows are split into at most
     * {@code 2*k} blocks of equal size and one task is created per block. After all
     * tasks have finished, the upper triangle is mirrored into the lower triangle and
     * the output metadata is finalized.
     *
     * @param m1            input matrix
     * @param ret           output matrix, modified in place
     * @param leftTranspose forwarded unchanged to the tasks
     * @param k             degree of parallelism
     * @throws DMLRuntimeException wrapping any failure of the pool or of a task
     */
    public static void matrixMultTransposeSelf(MatrixBlock m1, MatrixBlock ret, boolean leftTranspose, int k) {
        if (m1.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }
        if (!LibMatrixMult.satisfiesMultiThreadingConstraintsTSMM(m1, leftTranspose, 1L, k)) {
            LibMatrixMult.matrixMultTransposeSelf(m1, ret, leftTranspose);
            return;
        }
        ret.sparse = false;
        ret.allocateDenseBlock();
        try {
            ExecutorService pool = CommonThreadPool.get(k);
            try {
                List<MatrixMultTransposeTask> tasks = new ArrayList<>();
                // twice as many blocks as threads
                int blklen = (int) Math.ceil((double) ret.rlen / (double) (2 * k));
                for (int i = 0; i < 2 * k && i * blklen < ret.rlen; ++i) {
                    tasks.add(new MatrixMultTransposeTask(m1, ret, leftTranspose, i * blklen, Math.min((i + 1) * blklen, ret.rlen)));
                }
                // invokeAll blocks until every task has completed; get() surfaces task failures
                for (Future<?> rtask : pool.invokeAll(tasks)) {
                    rtask.get();
                }
            }
            finally {
                // release the pool on the failure paths as well
                pool.shutdown();
            }
        }
        catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        }
        long nnz = LibMatrixMult.copyUpperToLowerTriangle(ret);
        ret.setNonZeros(nnz);
        ret.examSparsity();
    }

    /**
     * Single-threaded permutation matrix multiplication of {@code pm1} and {@code m2}
     * into {@code ret1} and, where the kernel needs it, {@code ret2}.
     *
     * <p>Returns without touching the outputs if either input is empty. {@code ret1}
     * becomes sparse if {@code m2} is sparse or {@code ret1} was already flagged sparse.
     *
     * @param pm1  left-hand-side input (the permutation operand)
     * @param m2   right-hand-side matrix
     * @param ret1 first output matrix, modified in place
     * @param ret2 second output matrix, may be null
     */
    public static void matrixMultPermute(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2) {
        if (pm1.isEmptyBlock(false) || m2.isEmptyBlock(false)) {
            return;
        }

        // output representation of ret1 follows m2, unless it is already sparse
        ret1.sparse = m2.sparse || ret1.sparse;
        if (!ret1.sparse) {
            ret1.allocateDenseBlock();
        } else {
            ret1.allocateSparseRowsBlock();
        }

        final int rowUpper = pm1.rlen;
        if (m2.sparse) {
            matrixMultPermuteSparse(pm1, m2, ret1, ret2, 0, rowUpper);
        } else if (ret1.sparse) {
            matrixMultPermuteDenseSparse(pm1, m2, ret1, ret2, 0, rowUpper);
        } else {
            matrixMultPermuteDense(pm1, m2, ret1, ret2, 0, rowUpper);
        }

        // finalize the metadata of both outputs
        ret1.recomputeNonZeros();
        ret1.examSparsity();
        if (ret2 == null) {
            return;
        }
        ret2.recomputeNonZeros();
        ret2.examSparsity();
    }

    /**
     * Multi-threaded permutation matrix multiplication with up to {@code k} threads.
     *
     * <p>Returns without touching the outputs if either input is empty, and falls
     * back to the single-threaded overload for a single-row {@code pm1}. Otherwise
     * {@code ret1} is allocated dense and the rows of {@code pm1} are split into at
     * most {@code k} blocks of equal size, one task per block.
     *
     * <p>Every task's future is inspected so that a failure inside a worker is
     * reported to the caller instead of being silently dropped.
     *
     * @param pm1  left-hand-side input (the permutation operand)
     * @param m2   right-hand-side matrix
     * @param ret1 first output matrix, modified in place
     * @param ret2 second output matrix, may be null
     * @param k    degree of parallelism
     * @throws DMLRuntimeException if the calling thread is interrupted or a task fails
     */
    public static void matrixMultPermute(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int k) {
        if (pm1.isEmptyBlock(false) || m2.isEmptyBlock(false)) {
            return;
        }
        if (pm1.rlen == 1) {
            LibMatrixMult.matrixMultPermute(pm1, m2, ret1, ret2);
            return;
        }
        ret1.sparse = false;
        ret1.allocateDenseBlock();
        ExecutorService pool = CommonThreadPool.get(k);
        try {
            List<MatrixMultPermuteTask> tasks = new ArrayList<>();
            int blklen = (int) Math.ceil((double) pm1.rlen / (double) k);
            for (int i = 0; i < k && i * blklen < pm1.rlen; ++i) {
                tasks.add(new MatrixMultPermuteTask(pm1, m2, ret1, ret2, i * blklen, Math.min((i + 1) * blklen, pm1.rlen)));
            }
            // invokeAll blocks until every task has completed; get() surfaces task failures
            for (Future<?> task : pool.invokeAll(tasks)) {
                task.get();
            }
        }
        catch (InterruptedException e) {
            // preserve the interrupt status for code further up the stack
            Thread.currentThread().interrupt();
            throw new DMLRuntimeException(e);
        }
        catch (ExecutionException e) {
            throw new DMLRuntimeException(e);
        }
        finally {
            // release the pool on the failure paths as well
            pool.shutdown();
        }
        ret1.recomputeNonZeros();
        ret1.examSparsity();
        if (ret2 != null) {
            ret2.recomputeNonZeros();
            ret2.examSparsity();
        }
    }

    /**
     * Single-threaded weighted squared loss over {@code mX}, {@code mU}, {@code mV}
     * and the optional weights {@code mW}; the result is written into {@code ret}.
     *
     * <p>Returns early for weights type {@code POST} with empty {@code mW} and for
     * {@code POST_NZ} with empty {@code mX}. Kernel selection:
     * <ul>
     *   <li>dense kernel: all inputs dense and non-empty,</li>
     *   <li>sparse-dense kernel: {@code mX} (and {@code mW}, if given) sparse,
     *       {@code mU}/{@code mV} dense, all non-empty,</li>
     *   <li>generic kernel: everything else.</li>
     * </ul>
     * For sparse {@code mX} without weights an additional correction term is added.
     *
     * @param mX  matrix X
     * @param mU  factor U
     * @param mV  factor V
     * @param mW  weights, may be null
     * @param ret output, modified in place
     * @param wt  weights type
     */
    public static void matrixMultWSLoss(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightedSquaredLoss.WeightsType wt) {
        if ((wt == WeightedSquaredLoss.WeightsType.POST && mW.isEmptyBlock(false))
            || (wt == WeightedSquaredLoss.WeightsType.POST_NZ && mX.isEmptyBlock(false))) {
            ret.examSparsity();
            return;
        }

        final int rowUpper = mX.rlen;
        if (!mX.sparse && !mU.sparse && !mV.sparse && (mW == null || !mW.sparse)
            && !mX.isEmptyBlock() && !mU.isEmptyBlock() && !mV.isEmptyBlock() && (mW == null || !mW.isEmptyBlock())) {
            matrixMultWSLossDense(mX, mU, mV, mW, ret, wt, 0, rowUpper);
        } else if (mX.sparse && !mU.sparse && !mV.sparse && (mW == null || mW.sparse)
            && !mX.isEmptyBlock() && !mU.isEmptyBlock() && !mV.isEmptyBlock() && (mW == null || !mW.isEmptyBlock())) {
            matrixMultWSLossSparseDense(mX, mU, mV, mW, ret, wt, 0, rowUpper);
        } else {
            matrixMultWSLossGeneric(mX, mU, mV, mW, ret, wt, 0, rowUpper);
        }

        // correction term for the unweighted case over sparse X
        final boolean needsCorrection = mX.sparse && wt == WeightedSquaredLoss.WeightsType.NONE;
        if (needsCorrection) {
            addMatrixMultWSLossNoWeightCorrection(mU, mV, ret, 1);
        }
    }

    /**
     * Multi-threaded weighted squared loss with up to {@code k} threads.
     *
     * <p>Uses the same early exits as the single-threaded overload and falls back to
     * it for a single-row {@code mX}. Otherwise the rows of {@code mX} are split into
     * at most {@code k} blocks of equal size; every task returns a partial scalar, and
     * the partial scalars are summed into {@code ret}. For sparse {@code mX} without
     * weights an additional correction term is added afterwards.
     *
     * @param mX  matrix X
     * @param mU  factor U
     * @param mV  factor V
     * @param mW  weights, may be null
     * @param ret output, modified in place
     * @param wt  weights type
     * @param k   degree of parallelism
     * @throws DMLRuntimeException wrapping any failure of the pool or of a task
     */
    public static void matrixMultWSLoss(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightedSquaredLoss.WeightsType wt, int k) {
        if (wt == WeightedSquaredLoss.WeightsType.POST && mW.isEmptyBlock(false) || wt == WeightedSquaredLoss.WeightsType.POST_NZ && mX.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }
        if (mX.rlen == 1) {
            LibMatrixMult.matrixMultWSLoss(mX, mU, mV, mW, ret, wt);
            return;
        }
        try {
            ExecutorService pool = CommonThreadPool.get(k);
            try {
                List<MatrixMultWSLossTask> tasks = new ArrayList<>();
                int blklen = (int) Math.ceil((double) mX.rlen / (double) k);
                for (int i = 0; i < k && i * blklen < mX.rlen; ++i) {
                    tasks.add(new MatrixMultWSLossTask(mX, mU, mV, mW, wt, i * blklen, Math.min((i + 1) * blklen, mX.rlen)));
                }
                // invokeAll blocks until every task has completed
                List<Future<Double>> taskret = pool.invokeAll(tasks);
                LibMatrixMult.sumScalarResults(taskret, ret);
            }
            finally {
                // release the pool on the failure paths as well
                pool.shutdown();
            }
        }
        catch (Exception e) {
            throw new DMLRuntimeException(e);
        }
        if (mX.sparse && wt == WeightedSquaredLoss.WeightsType.NONE) {
            LibMatrixMult.addMatrixMultWSLossNoWeightCorrection(mU, mV, ret, k);
        }
    }

    /**
     * Single-threaded weighted sigmoid over {@code mW}, {@code mU} and {@code mV}.
     *
     * <p>Returns early if {@code mW} is empty. The output takes over the sparse flag
     * of {@code mW}. Kernel selection, in order:
     * <ul>
     *   <li>native dense kernel: native library loaded, all inputs dense and
     *       non-empty, {@code mW} a row or column vector, not memory bound, and all
     *       three dense blocks contiguous,</li>
     *   <li>dense kernel: all inputs dense and non-empty,</li>
     *   <li>sparse-dense kernel: {@code mW} sparse, {@code mU}/{@code mV} dense and
     *       non-empty,</li>
     *   <li>generic kernel: everything else.</li>
     * </ul>
     *
     * @param mW  weights matrix
     * @param mU  factor U
     * @param mV  factor V
     * @param ret output matrix, modified in place
     * @param wt  sigmoid variant
     */
    public static void matrixMultWSigmoid(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedSigmoid.WSigmoidType wt) {
        if (mW.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }

        // output representation follows the weights
        ret.sparse = mW.sparse;
        ret.allocateBlock();

        final boolean allDense = !mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock();
        final boolean useNative = NativeHelper.isNativeLibraryLoaded() && allDense
            && (mW.rlen == 1 || mW.clen == 1)
            && !LibMatrixNative.isMatMultMemoryBound(mU.rlen, mU.clen, mV.rlen)
            && mW.getDenseBlock().isContiguous() && mU.getDenseBlock().isContiguous() && mV.getDenseBlock().isContiguous();
        final int rowUpper = mW.rlen;

        if (useNative) {
            matrixMultWSigmoidDenseNative(mW, mU, mV, ret, wt);
        } else if (allDense) {
            matrixMultWSigmoidDense(mW, mU, mV, ret, wt, 0, rowUpper);
        } else if (mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock()) {
            matrixMultWSigmoidSparseDense(mW, mU, mV, ret, wt, 0, rowUpper);
        } else {
            matrixMultWSigmoidGeneric(mW, mU, mV, ret, wt, 0, rowUpper);
        }

        ret.recomputeNonZeros();
        ret.examSparsity();
    }

    /**
     * Multi-threaded weighted sigmoid with up to {@code k} threads.
     *
     * <p>Returns early if {@code mW} is empty and falls back to the single-threaded
     * overload for a single-row {@code mW} or when the output representation is not
     * thread-safe. Otherwise the rows of {@code mW} are split into at most {@code k}
     * blocks of equal size; every task returns the non-zero count of its block, and
     * the counts are summed into {@code ret.nonZeros}.
     *
     * @param mW  weights matrix
     * @param mU  factor U
     * @param mV  factor V
     * @param ret output matrix, modified in place
     * @param wt  sigmoid variant
     * @param k   degree of parallelism
     * @throws DMLRuntimeException wrapping any failure of the pool or of a task
     */
    public static void matrixMultWSigmoid(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedSigmoid.WSigmoidType wt, int k) {
        if (mW.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }
        if (mW.rlen == 1 || !MatrixBlock.isThreadSafe(mW.sparse)) {
            LibMatrixMult.matrixMultWSigmoid(mW, mU, mV, ret, wt);
            return;
        }
        ret.sparse = mW.sparse;
        ret.allocateBlock();
        try {
            ExecutorService pool = CommonThreadPool.get(k);
            try {
                List<MatrixMultWSigmoidTask> tasks = new ArrayList<>();
                int blklen = (int) Math.ceil((double) mW.rlen / (double) k);
                for (int i = 0; i < k && i * blklen < mW.rlen; ++i) {
                    tasks.add(new MatrixMultWSigmoidTask(mW, mU, mV, ret, wt, i * blklen, Math.min((i + 1) * blklen, mW.rlen)));
                }
                // invokeAll blocks until every task has completed
                List<? extends Future<?>> taskret = pool.invokeAll(tasks);
                ret.nonZeros = 0L;
                for (Future<?> task : taskret) {
                    ret.nonZeros += ((Long) task.get()).longValue();
                }
            }
            finally {
                // release the pool on the failure paths as well
                pool.shutdown();
            }
        }
        catch (Exception e) {
            throw new DMLRuntimeException(e);
        }
        ret.examSparsity();
    }

    /**
     * Single-threaded weighted divide matrix multiplication over {@code mW},
     * {@code mU}, {@code mV} and the optional {@code mX}.
     *
     * <p>Returns early if {@code mW} is empty, if the type is "left" and {@code mU} is
     * empty, or if the type is "right" and {@code mV} is empty. The output is sparse
     * only for the basic type with sparse {@code mW}. Kernel selection:
     * <ul>
     *   <li>dense kernel: {@code mW}, {@code mU}, {@code mV} dense, {@code mX} absent,
     *       dense or treated as scalar, and {@code mU}/{@code mV} non-empty,</li>
     *   <li>sparse-dense kernel: {@code mW} sparse, {@code mU}/{@code mV} dense and
     *       non-empty, {@code mX} absent, sparse or treated as scalar,</li>
     *   <li>generic kernel: everything else.</li>
     * </ul>
     *
     * @param mW  weights matrix
     * @param mU  factor U
     * @param mV  factor V
     * @param mX  optional fourth operand, may be null
     * @param ret output matrix, modified in place
     * @param wt  operation type
     */
    public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WeightedDivMM.WDivMMType wt) {
        // the trailing basic-type check repeats the first term and is kept as is
        if (mW.isEmptyBlock(false)
            || (wt.isLeft() && mU.isEmptyBlock(false))
            || (wt.isRight() && mV.isEmptyBlock(false))
            || (wt.isBasic() && mW.isEmptyBlock(false))) {
            ret.examSparsity();
            return;
        }

        // sparse output only for the basic type over sparse weights
        ret.sparse = wt.isBasic() && mW.sparse;
        ret.allocateBlock();

        final boolean scalarX = wt.hasScalar();
        final int rowUpper = mW.rlen;
        final int colUpper = mW.clen;
        if (!mW.sparse && !mU.sparse && !mV.sparse && (mX == null || !mX.sparse || scalarX)
            && !mU.isEmptyBlock() && !mV.isEmptyBlock()) {
            matrixMultWDivMMDense(mW, mU, mV, mX, ret, wt, 0, rowUpper, 0, colUpper);
        } else if (mW.sparse && !mU.sparse && !mV.sparse && (mX == null || mX.sparse || scalarX)
            && !mU.isEmptyBlock() && !mV.isEmptyBlock()) {
            matrixMultWDivMMSparseDense(mW, mU, mV, mX, ret, wt, 0, rowUpper, 0, colUpper);
        } else {
            matrixMultWDivMMGeneric(mW, mU, mV, mX, ret, wt, 0, rowUpper, 0, colUpper);
        }

        ret.recomputeNonZeros();
        ret.examSparsity();
    }

    /**
     * Multi-threaded weighted divide matrix multiplication with up to {@code k} threads.
     *
     * <p>Uses the same early exits as the single-threaded overload and falls back to
     * it when the allocated output is not thread-safe. For the "left" types the work
     * is split over the columns of {@code mW}; otherwise it is split over the rows.
     * Every task returns the non-zero count of its partition, and the counts are
     * summed into {@code ret.nonZeros}.
     *
     * @param mW  weights matrix
     * @param mU  factor U
     * @param mV  factor V
     * @param mX  optional fourth operand, may be null
     * @param ret output matrix, modified in place
     * @param wt  operation type
     * @param k   degree of parallelism
     * @throws DMLRuntimeException wrapping any failure of the pool or of a task
     */
    public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WeightedDivMM.WDivMMType wt, int k) {
        if (mW.isEmptyBlock(false) || wt.isLeft() && mU.isEmptyBlock(false) || wt.isRight() && mV.isEmptyBlock(false) || wt.isBasic() && mW.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }
        ret.sparse = wt.isBasic() ? mW.sparse : false;
        ret.allocateBlock();
        if (!ret.isThreadSafe()) {
            LibMatrixMult.matrixMultWDivMM(mW, mU, mV, mX, ret, wt);
            return;
        }
        try {
            ExecutorService pool = CommonThreadPool.get(k);
            try {
                List<MatrixMultWDivTask> tasks = new ArrayList<>();
                if (wt.isLeft()) {
                    // column partitioning
                    int blklen = (int) Math.ceil((double) mW.clen / (double) k);
                    for (int j = 0; j < k && j * blklen < mW.clen; ++j) {
                        tasks.add(new MatrixMultWDivTask(mW, mU, mV, mX, ret, wt, 0, mW.rlen, j * blklen, Math.min((j + 1) * blklen, mW.clen)));
                    }
                } else {
                    // row partitioning
                    int blklen = (int) Math.ceil((double) mW.rlen / (double) k);
                    for (int i = 0; i < k && i * blklen < mW.rlen; ++i) {
                        tasks.add(new MatrixMultWDivTask(mW, mU, mV, mX, ret, wt, i * blklen, Math.min((i + 1) * blklen, mW.rlen), 0, mW.clen));
                    }
                }
                // invokeAll blocks until every task has completed
                List<? extends Future<?>> taskret = pool.invokeAll(tasks);
                ret.nonZeros = 0L;
                for (Future<?> task : taskret) {
                    ret.nonZeros += ((Long) task.get()).longValue();
                }
            }
            finally {
                // release the pool on the failure paths as well
                pool.shutdown();
            }
        }
        catch (Exception e) {
            throw new DMLRuntimeException(e);
        }
        ret.examSparsity();
    }

    /**
     * Single-threaded weighted cross entropy over {@code mW}, {@code mU} and
     * {@code mV} with the constant {@code eps}.
     *
     * <p>Returns early if {@code mW} is empty. The output is allocated dense, and the
     * kernel is picked by input representation: dense (all dense and non-empty),
     * sparse-dense (sparse {@code mW}, dense non-empty factors) or generic. This
     * method does not post-process the output metadata after the kernel call.
     *
     * @param mW  weights matrix
     * @param mU  factor U
     * @param mV  factor V
     * @param eps constant forwarded to the kernels
     * @param ret output, modified in place
     * @param wt  operation type
     */
    public static void matrixMultWCeMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WeightedCrossEntropy.WCeMMType wt) {
        if (mW.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }

        // dense output
        ret.sparse = false;
        ret.allocateDenseBlock();

        final int rowUpper = mW.rlen;
        if (!mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock()) {
            matrixMultWCeMMDense(mW, mU, mV, eps, ret, wt, 0, rowUpper);
        } else if (mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock()) {
            matrixMultWCeMMSparseDense(mW, mU, mV, eps, ret, wt, 0, rowUpper);
        } else {
            matrixMultWCeMMGeneric(mW, mU, mV, eps, ret, wt, 0, rowUpper);
        }
    }

    /**
     * Multi-threaded weighted cross entropy with up to {@code k} threads.
     *
     * <p>Returns early if {@code mW} is empty. Otherwise the output is allocated
     * dense and the rows of {@code mW} are split into at most {@code k} blocks of
     * equal size; every task returns a partial scalar, and the partial scalars are
     * summed into {@code ret}.
     *
     * @param mW  weights matrix
     * @param mU  factor U
     * @param mV  factor V
     * @param eps constant forwarded to the tasks
     * @param ret output, modified in place
     * @param wt  operation type
     * @param k   degree of parallelism
     * @throws DMLRuntimeException wrapping any failure of the pool or of a task
     */
    public static void matrixMultWCeMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WeightedCrossEntropy.WCeMMType wt, int k) {
        if (mW.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }
        ret.sparse = false;
        ret.allocateDenseBlock();
        try {
            ExecutorService pool = CommonThreadPool.get(k);
            try {
                List<MatrixMultWCeTask> tasks = new ArrayList<>();
                int blklen = (int) Math.ceil((double) mW.rlen / (double) k);
                for (int i = 0; i < k && i * blklen < mW.rlen; ++i) {
                    tasks.add(new MatrixMultWCeTask(mW, mU, mV, eps, wt, i * blklen, Math.min((i + 1) * blklen, mW.rlen)));
                }
                // invokeAll blocks until every task has completed
                List<Future<Double>> taskret = pool.invokeAll(tasks);
                LibMatrixMult.sumScalarResults(taskret, ret);
            }
            finally {
                // release the pool on the failure paths as well
                pool.shutdown();
            }
        }
        catch (Exception e) {
            throw new DMLRuntimeException(e);
        }
    }

    /**
     * Single-threaded weighted unary matrix multiplication over {@code mW},
     * {@code mU} and {@code mV} with the value function {@code fn}.
     *
     * <p>Returns early if {@code mW} is empty. The output takes over the sparse flag
     * of {@code mW}, and the kernel is picked by input representation: dense (all
     * dense and non-empty), sparse-dense (sparse {@code mW}, dense non-empty factors)
     * or generic.
     *
     * @param mW  weights matrix
     * @param mU  factor U
     * @param mV  factor V
     * @param ret output matrix, modified in place
     * @param wt  operation type
     * @param fn  value function forwarded to the kernels
     */
    public static void matrixMultWuMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedUnaryMM.WUMMType wt, ValueFunction fn) {
        if (mW.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }

        // output representation follows the weights
        ret.sparse = mW.sparse;
        ret.allocateBlock();

        final int rowUpper = mW.rlen;
        if (!mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock()) {
            matrixMultWuMMDense(mW, mU, mV, ret, wt, fn, 0, rowUpper);
        } else if (mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock()) {
            matrixMultWuMMSparseDense(mW, mU, mV, ret, wt, fn, 0, rowUpper);
        } else {
            matrixMultWuMMGeneric(mW, mU, mV, ret, wt, fn, 0, rowUpper);
        }

        ret.recomputeNonZeros();
        ret.examSparsity();
    }

    /**
     * Multi-threaded weighted unary matrix multiplication with up to {@code k} threads.
     *
     * <p>Returns early if {@code mW} is empty and falls back to the single-threaded
     * overload for a single-row {@code mW} or when the output representation is not
     * thread-safe. Otherwise the rows of {@code mW} are split into at most {@code k}
     * blocks of equal size; every task returns the non-zero count of its block, and
     * the counts are summed into {@code ret.nonZeros}.
     *
     * @param mW  weights matrix
     * @param mU  factor U
     * @param mV  factor V
     * @param ret output matrix, modified in place
     * @param wt  operation type
     * @param fn  value function forwarded to the tasks
     * @param k   degree of parallelism
     * @throws DMLRuntimeException wrapping any failure of the pool or of a task
     */
    public static void matrixMultWuMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedUnaryMM.WUMMType wt, ValueFunction fn, int k) {
        if (mW.isEmptyBlock(false)) {
            ret.examSparsity();
            return;
        }
        if (mW.rlen == 1 || !MatrixBlock.isThreadSafe(mW.sparse)) {
            LibMatrixMult.matrixMultWuMM(mW, mU, mV, ret, wt, fn);
            return;
        }
        ret.sparse = mW.sparse;
        ret.allocateBlock();
        try {
            ExecutorService pool = CommonThreadPool.get(k);
            try {
                List<MatrixMultWuTask> tasks = new ArrayList<>();
                int blklen = (int) Math.ceil((double) mW.rlen / (double) k);
                for (int i = 0; i < k && i * blklen < mW.rlen; ++i) {
                    tasks.add(new MatrixMultWuTask(mW, mU, mV, ret, wt, fn, i * blklen, Math.min((i + 1) * blklen, mW.rlen)));
                }
                // invokeAll blocks until every task has completed
                List<? extends Future<?>> taskret = pool.invokeAll(tasks);
                ret.nonZeros = 0L;
                for (Future<?> task : taskret) {
                    ret.nonZeros += ((Long) task.get()).longValue();
                }
            }
            finally {
                // release the pool on the failure paths as well
                pool.shutdown();
            }
        }
        catch (Exception e) {
            throw new DMLRuntimeException(e);
        }
        ret.examSparsity();
    }

    /**
     * Dispatcher for dense-dense matrix multiplication: picks a specialized kernel
     * from the dimensions {@code m = m1.rlen}, {@code n = m2.clen} and
     * {@code cd = m1.clen}.
     *
     * <p>Cases, in order: dot product (m=1, n=1), outer product (n&gt;1, cd=1),
     * vector-scalar (n=1, cd=1), matrix-vector with short (cd&lt;=2048) or tall
     * right-hand side (n=1), vector-matrix (pm2, m=1), short left-hand side
     * (pm2, m&lt;=16), skinny right-hand side (tm2), and the general kernel.
     *
     * @param m1  dense left-hand-side matrix
     * @param m2  dense right-hand-side matrix
     * @param ret dense output matrix, modified in place
     * @param tm2 selects the skinny-rhs kernel, which is passed {@code m2.rlen} as its column count
     * @param pm2 selects the kernels whose {@code rl}/{@code ru} range is used to index rows of {@code m2}
     * @param rl  lower bound of the processed range
     * @param ru  upper bound (exclusive) of the processed range
     * @param cl  column lower bound, used by the general kernel only
     * @param cu  column upper bound, used by the general kernel only
     */
    private static void matrixMultDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2, int rl, int ru, int cl, int cu) {
        final DenseBlock lhs = m1.getDenseBlock();
        final DenseBlock rhs = m2.getDenseBlock();
        final DenseBlock out = ret.getDenseBlock();
        final int rows = m1.rlen;
        final int cols = m2.clen;
        final int common = m1.clen;

        // dot product: 1 x cd times cd x 1
        if (rows == 1 && cols == 1) {
            final double[] left = lhs.valuesAt(0);
            final double[] right = rhs.valuesAt(0);
            out.set(0, 0, dotProduct(left, right, common));
            return;
        }

        // outer product: m x 1 times 1 x n
        if (cols > 1 && common == 1) {
            final double[] colVector = lhs.valuesAt(0);
            final double[] rowVector = rhs.valuesAt(0);
            for (int r = rl; r < ru; r++) {
                final double[] target = out.values(r);
                final int offset = out.pos(r);
                final double scale = colVector[r];
                if (scale == 1.0) {
                    System.arraycopy(rowVector, 0, target, offset, cols);
                } else if (scale != 0.0) {
                    vectMultiplyWrite(scale, rowVector, target, 0, offset, cols);
                } else {
                    Arrays.fill(target, offset, offset + cols, 0.0);
                }
            }
            return;
        }

        // vector times scalar: m x 1 times 1 x 1
        if (cols == 1 && common == 1) {
            final double[] colVector = lhs.valuesAt(0);
            final double[] target = out.valuesAt(0);
            vectMultiplyWrite(rhs.get(0, 0), colVector, target, rl, rl, ru - rl);
            return;
        }

        // matrix-vector, split by the length of the right-hand side
        if (cols == 1) {
            if (common <= 2048) {
                matrixMultDenseDenseMVShortRHS(lhs, rhs, out, common, rl, ru);
            } else {
                matrixMultDenseDenseMVTallRHS(lhs, rhs, out, common, rl, ru);
            }
            return;
        }

        // remaining matrix-matrix cases
        if (pm2 && rows == 1) {
            matrixMultDenseDenseVM(lhs, rhs, out, cols, common, rl, ru);
        } else if (pm2 && rows <= 16) {
            matrixMultDenseDenseMMShortLHS(lhs, rhs, out, rows, cols, common, rl, ru);
        } else if (tm2) {
            matrixMultDenseDenseMMSkinnyRHS(lhs, rhs, out, m2.rlen, common, rl, ru);
        } else {
            matrixMultDenseDenseMM(lhs, rhs, out, cols, common, rl, ru, cl, cu);
        }
    }

    /**
     * Dense matrix-vector multiplication for a short right-hand side: each output
     * entry in {@code [rl, ru)} is the dot product of one row of {@code a} with the
     * vector {@code b}.
     *
     * @param a  dense left-hand-side block
     * @param b  dense right-hand-side vector (read via {@code valuesAt(0)})
     * @param c  dense output vector (written via {@code valuesAt(0)})
     * @param cd common dimension, i.e. the dot-product length
     * @param rl row lower bound
     * @param ru row upper bound (exclusive)
     */
    private static void matrixMultDenseDenseMVShortRHS(DenseBlock a, DenseBlock b, DenseBlock c, int cd, int rl, int ru) {
        final double[] vector = b.valuesAt(0);
        final double[] result = c.valuesAt(0);
        int row = rl;
        while (row < ru) {
            final double[] rowValues = a.values(row);
            final int rowOffset = a.pos(row);
            result[row] = dotProduct(rowValues, vector, rowOffset, 0, cd);
            row++;
        }
    }

    /**
     * Dense matrix-vector multiplication for a tall right-hand side, blocked over
     * 32 rows and 2048 entries of the common dimension. Partial dot products are
     * accumulated into the output entries of rows {@code [rl, ru)}.
     *
     * @param a  dense left-hand-side block
     * @param b  dense right-hand-side vector (read via {@code valuesAt(0)})
     * @param c  dense output vector (accumulated via {@code valuesAt(0)})
     * @param cd common dimension
     * @param rl row lower bound
     * @param ru row upper bound (exclusive)
     */
    private static void matrixMultDenseDenseMVTallRHS(DenseBlock a, DenseBlock b, DenseBlock c, int cd, int rl, int ru) {
        final int blocksizeI = 32;
        final int blocksizeK = 2048;
        final double[] vector = b.valuesAt(0);
        final double[] result = c.valuesAt(0);
        for (int rowStart = rl; rowStart < ru; rowStart += blocksizeI) {
            final int rowEnd = Math.min(rowStart + blocksizeI, ru);
            for (int kStart = 0; kStart < cd; kStart += blocksizeK) {
                final int kEnd = Math.min(kStart + blocksizeK, cd);
                final int kLen = kEnd - kStart;
                for (int row = rowStart; row < rowEnd; row++) {
                    result[row] += dotProduct(a.values(row), vector, a.pos(row, kStart), kStart, kLen);
                }
            }
        }
    }

    /**
     * Dense vector-matrix multiplication: accumulates {@code a[k] * b[k, :]} into the
     * output row for all {@code k} in {@code [rl, ru)}, skipping zero entries of
     * {@code a}.
     *
     * <p>For a contiguous {@code b}, rows are processed in pairs after an optional
     * single leading row that makes the remaining count even; for a non-contiguous
     * {@code b}, all rows are processed one at a time.
     *
     * @param a  dense left-hand-side vector (read via {@code valuesAt(0)})
     * @param b  dense right-hand-side block
     * @param c  dense output vector (accumulated via {@code valuesAt(0)})
     * @param n  number of columns of {@code b}
     * @param cd common dimension (not used by this kernel)
     * @param rl lower bound over the rows of {@code b}
     * @param ru upper bound (exclusive) over the rows of {@code b}
     */
    private static void matrixMultDenseDenseVM(DenseBlock a, DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru) {
        final double[] left = a.valuesAt(0);
        final double[] result = c.valuesAt(0);

        // rows handled one by one: the odd leading row, or everything if b is not contiguous
        final int pairedStart = b.isContiguous() ? rl + (ru - rl) % 2 : ru;
        for (int row = rl; row < pairedStart; row++) {
            final double scale = left[row];
            if (scale != 0.0) {
                vectMultiplyAdd(scale, b.values(row), result, b.pos(row), 0, n);
            }
        }

        // remaining rows in pairs, addressed directly in the contiguous array
        final double[] right = b.valuesAt(0);
        for (int row = pairedStart, offset = pairedStart * n; row < ru; row += 2, offset += 2 * n) {
            final double first = left[row];
            final double second = left[row + 1];
            final boolean useFirst = first != 0.0;
            final boolean useSecond = second != 0.0;
            if (useFirst && useSecond) {
                vectMultiplyAdd2(first, second, right, result, offset, offset + n, 0, n);
            } else if (useFirst) {
                vectMultiplyAdd(first, right, result, offset, 0, n);
            } else if (useSecond) {
                vectMultiplyAdd(second, right, result, offset + n, 0, n);
            }
        }
    }

    /**
     * Dense matrix-matrix multiplication for a short left-hand side ({@code m} rows),
     * parallelizable over the common dimension: processes the rows {@code [rl, ru)}
     * of {@code b} and accumulates into all {@code m} rows of {@code c}.
     *
     * <p>The first {@code (ru-rl) % 4} rows of the range are handled one by one so
     * that the remaining rows form groups of four. Those are processed in blocks of
     * 48 rows by 1024 columns with a four-row kernel, which requires the rows of a
     * block to be contiguous in {@code b}.
     *
     * <p>Fix: for a block whose rows are not contiguous in {@code b}, the previous
     * fallback re-processed the leading remainder rows {@code [rl, rl+kn)} over the
     * full width {@code n}. That left the rows of the current block out and
     * re-accumulated the remainder rows. The fallback now processes exactly the rows
     * {@code [bk, bkmin)} restricted to the current column block.
     *
     * @param a  dense left-hand-side block
     * @param b  dense right-hand-side block
     * @param c  dense output block, accumulated in place
     * @param m  number of rows of {@code a}
     * @param n  number of columns of {@code b}
     * @param cd common dimension (not used by this kernel)
     * @param rl lower bound over the rows of {@code b}
     * @param ru upper bound (exclusive) over the rows of {@code b}
     */
    private static void matrixMultDenseDenseMMShortLHS(DenseBlock a, DenseBlock b, DenseBlock c, int m, int n, int cd, int rl, int ru) {
        // number of leading rows that do not fit into groups of four
        final int kn = (ru - rl) % 4;
        for (int i = 0; i < m; ++i) {
            double[] avals = a.values(i);
            double[] cvals = c.values(i);
            int aix = a.pos(i);
            int cix = c.pos(i);
            for (int k = rl; k < rl + kn; ++k) {
                if (avals[aix + k] == 0.0) continue;
                LibMatrixMult.vectMultiplyAdd(avals[aix + k], b.values(k), cvals, b.pos(k), cix, n);
            }
        }
        final int blocksizeK = 48;
        final int blocksizeJ = 1024;
        for (int bk = rl + kn; bk < ru; bk += blocksizeK) {
            final int bkmin = Math.min(ru, bk + blocksizeK);
            // depends on the row block only, so evaluate it once per block
            final boolean contiguous = b.isContiguous(bk, bkmin - 1);
            for (int bj = 0; bj < n; bj += blocksizeJ) {
                final int bjlen = Math.min(n, bj + blocksizeJ) - bj;
                for (int i = 0; i < m; ++i) {
                    double[] avals = a.values(i);
                    double[] cvals = c.values(i);
                    int aix = a.pos(i);
                    int cix = c.pos(i, bj);
                    if (contiguous) {
                        // four rows of b at a time, addressed by offsets within one array
                        double[] bvals = b.values(bk);
                        for (int k = bk, bix = b.pos(bk, bj); k < bkmin; k += 4, bix += 4 * n) {
                            LibMatrixMult.vectMultiplyAdd4(avals[aix + k], avals[aix + k + 1], avals[aix + k + 2], avals[aix + k + 3], bvals, cvals, bix, bix + n, bix + 2 * n, bix + 3 * n, cix, bjlen);
                        }
                    } else {
                        // rows of this block span several arrays: process them one by one
                        for (int k = bk; k < bkmin; ++k) {
                            if (avals[aix + k] == 0.0) continue;
                            LibMatrixMult.vectMultiplyAdd(avals[aix + k], b.values(k), cvals, b.pos(k, bj), cix, bjlen);
                        }
                    }
                }
            }
        }
    }

    /**
     * Dense-dense multiply in which every output cell is written as a single
     * dot product of length {@code cd}: cell {@code (i, j)} of {@code c} is the
     * dot product of row {@code i} of {@code a} with row {@code j} of {@code b}.
     * NOTE(review): reading {@code b} row-wise suggests the caller passes a
     * transposed right-hand side - confirm at the call site.
     *
     * @param a  left-hand side
     * @param b  right-hand side, indexed by output column
     * @param c  output block; cells are overwritten, not accumulated
     * @param n2 number of output columns
     * @param cd length of each dot product
     * @param rl first output row (inclusive)
     * @param ru last output row (exclusive)
     */
    private static void matrixMultDenseDenseMMSkinnyRHS(DenseBlock a, DenseBlock b, DenseBlock c, int n2, int cd, int rl, int ru) {
        for (int row = rl; row < ru; row++) {
            final double[] lhsRow = a.values(row);
            final double[] outRow = c.values(row);
            final int lhsOff = a.pos(row);
            final int outOff = c.pos(row);
            int col = 0;
            while (col < n2) {
                outRow[outOff + col] = LibMatrixMult.dotProduct(lhsRow, b.values(col), lhsOff, b.pos(col), cd);
                col++;
            }
        }
    }

    /**
     * Tiled dense-dense matrix multiply that accumulates the product of rows
     * {@code [rl, ru)} of {@code a} with columns {@code [cl, cu)} of {@code b}
     * into {@code c}.
     *
     * <p>Tiles are 32 rows of {@code a} by 24 entries of the common dimension by
     * 1024 output columns. When the 24 rows of {@code b} in a tile are stored in
     * one array, the entries of the current {@code a} row are first gathered via
     * {@code copyNonZeroElements} (presumably keeping only non-zeros together
     * with their offsets into {@code b}) and then applied in groups of up to
     * four rows. Otherwise each row of {@code b} is looked up individually.
     *
     * @param a  left-hand side
     * @param b  right-hand side
     * @param c  output block that is accumulated into
     * @param n  row stride of {@code b} passed to {@code copyNonZeroElements}
     * @param cd common dimension
     * @param rl first row of {@code a} / {@code c} (inclusive)
     * @param ru last row of {@code a} / {@code c} (exclusive)
     * @param cl first output column (inclusive)
     * @param cu last output column (exclusive)
     */
    public static void matrixMultDenseDenseMM(DenseBlock a, DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru, int cl, int cu) {
        final int blocksizeI = 32;
        final int blocksizeK = 24;
        final int blocksizeJ = 1024;
        // scratch buffers filled by copyNonZeroElements for one a-row of a tile
        double[] ta = new double[blocksizeK];
        int[] tbi = new int[blocksizeK];
        for (int bi = rl; bi < ru; bi += blocksizeI) {
            int bimin = Math.min(ru, bi + blocksizeI);
            for (int bk = 0; bk < cd; bk += blocksizeK) {
                int bkmin = Math.min(cd, bk + blocksizeK);
                for (int bj = cl; bj < cu; bj += blocksizeJ) {
                    int bklen = bkmin - bk;
                    int bjlen = Math.min(cu, bj + blocksizeJ) - bj;
                    for (int i = bi; i < bimin; ++i) {
                        double[] avals = a.values(i);
                        double[] cvals = c.values(i);
                        int aixi = a.pos(i, bk);
                        int cixj = c.pos(i, bj);
                        if (b.isContiguous(bk, bkmin - 1)) {
                            double[] bvals = b.values(bk);
                            int bkpos = b.pos(bk, bj);
                            int knnz = LibMatrixMult.copyNonZeroElements(avals, aixi, bkpos, n, ta, tbi, bklen);
                            // leading entries that do not fill a group of four
                            int bn = knnz % 4;
                            switch (bn) {
                                case 1: {
                                    LibMatrixMult.vectMultiplyAdd(ta[0], bvals, cvals, tbi[0], cixj, bjlen);
                                    break;
                                }
                                case 2: {
                                    LibMatrixMult.vectMultiplyAdd2(ta[0], ta[1], bvals, cvals, tbi[0], tbi[1], cixj, bjlen);
                                    break;
                                }
                                case 3: {
                                    LibMatrixMult.vectMultiplyAdd3(ta[0], ta[1], ta[2], bvals, cvals, tbi[0], tbi[1], tbi[2], cixj, bjlen);
                                    break;
                                }
                                default:
                                    break;
                            }
                            for (int k = bn; k < knnz; k += 4) {
                                LibMatrixMult.vectMultiplyAdd4(ta[k], ta[k + 1], ta[k + 2], ta[k + 3], bvals, cvals, tbi[k], tbi[k + 1], tbi[k + 2], tbi[k + 3], cixj, bjlen);
                            }
                            continue;
                        }
                        // Rows of b span several arrays: address each row on its own.
                        // aixi is the position of a(i, bk) inside avals, so a(i, k) sits at
                        // aixi + (k - bk); indexing avals[k] directly ignored the row offset.
                        for (int k = bk; k < bkmin; ++k) {
                            double aval = avals[aixi + (k - bk)];
                            if (aval == 0.0) continue;
                            LibMatrixMult.vectMultiplyAdd(aval, b.values(k), cvals, b.pos(k, bj), cixj, bjlen);
                        }
                    }
                }
            }
        }
    }

    /**
     * Multiplies a dense left-hand side with a sparse right-hand side and
     * accumulates into the dense block of {@code ret}.
     *
     * <p>If {@code pm2} is set and {@code m1} has a single row, {@code [rl, ru)}
     * is a range over the rows of {@code m2}; otherwise it is a range over the
     * rows of {@code m1}, processed in 32 x 32 tiles over rows and the common
     * dimension.
     *
     * @param m1  dense left-hand side
     * @param m2  sparse right-hand side
     * @param ret output matrix with an allocated dense block
     * @param pm2 selects the vector-matrix path when {@code m1} has one row
     * @param rl  range start (inclusive)
     * @param ru  range end (exclusive)
     */
    private static void matrixMultDenseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) {
        final DenseBlock lhs = m1.getDenseBlock();
        final DenseBlock out = ret.getDenseBlock();
        final int m = m1.rlen;
        final int cd = m1.clen;
        final SparseBlock rhs = m2.sparseBlock;

        if (pm2 && m == 1) {
            // vector-matrix: scale every non-empty rhs row by the matching lhs entry
            final double[] avals = lhs.valuesAt(0);
            final double[] cvals = out.valuesAt(0);
            for (int k = rl; k < ru; k++) {
                if (avals[k] != 0.0 && !rhs.isEmpty(k)) {
                    LibMatrixMult.vectMultiplyAdd(avals[k], rhs.values(k), cvals, rhs.indexes(k), rhs.pos(k), 0, rhs.size(k));
                }
            }
            return;
        }

        final int tile = 32;
        for (int rowBlock = rl; rowBlock < ru; rowBlock += tile) {
            final int rowEnd = Math.min(ru, rowBlock + tile);
            for (int kBlock = 0; kBlock < cd; kBlock += tile) {
                final int kEnd = Math.min(cd, kBlock + tile);
                for (int i = rowBlock; i < rowEnd; i++) {
                    final double[] avals = lhs.values(i);
                    final double[] cvals = out.values(i);
                    final int aix = lhs.pos(i);
                    final int cix = out.pos(i);
                    for (int k = kBlock; k < kEnd; k++) {
                        final double aval = avals[aix + k];
                        if (aval != 0.0 && !rhs.isEmpty(k)) {
                            LibMatrixMult.vectMultiplyAdd(aval, rhs.values(k), cvals, rhs.indexes(k), rhs.pos(k), cix, rhs.size(k));
                        }
                    }
                }
            }
        }
    }

    /**
     * Dispatches a sparse-dense multiply to a specialised kernel based on the
     * shapes of the inputs and the {@code pm2} flag.
     *
     * <p>The ratio {@code m * cd / m1.nonZeros} is computed up front with integer
     * division, so a {@code nonZeros} of zero raises {@link ArithmeticException}
     * regardless of which kernel would have been chosen.
     *
     * @param m1  sparse left-hand side
     * @param m2  dense right-hand side
     * @param ret output matrix with an allocated dense block
     * @param pm2 enables the kernels that split work over the common dimension
     * @param rl  range start (inclusive), forwarded to the kernel
     * @param ru  range end (exclusive), forwarded to the kernel
     */
    private static void matrixMultSparseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) {
        final SparseBlock lhs = m1.sparseBlock;
        final DenseBlock rhs = m2.getDenseBlock();
        final DenseBlock out = ret.getDenseBlock();
        final int m = m1.rlen;
        final int n = m2.clen;
        final int cd = m2.rlen;
        final long xsp = (long) m * (long) cd / m1.nonZeros;

        final boolean rhsIsColumnVector = (n == 1);
        if (rhsIsColumnVector && m == 1) {
            // single dot product
            if (lhs.isEmpty(0)) {
                return;
            }
            out.set(0, 0, LibMatrixMult.dotProduct(lhs.values(0), rhs.values(0), lhs.indexes(0), lhs.pos(0), 0, lhs.size(0)));
            return;
        }
        if (rhsIsColumnVector) {
            if (cd <= 2048) {
                LibMatrixMult.matrixMultSparseDenseMVShortRHS(lhs, rhs, out, cd, rl, ru);
            } else {
                LibMatrixMult.matrixMultSparseDenseMVTallRHS(lhs, rhs, out, cd, xsp, rl, ru);
            }
            return;
        }
        if (pm2 && m == 1) {
            LibMatrixMult.matrixMultSparseDenseVM(lhs, rhs, out, n, rl, ru);
            return;
        }
        if (pm2 && m <= 16) {
            LibMatrixMult.matrixMultSparseDenseMMShortLHS(lhs, rhs, out, n, cd, rl, ru);
            return;
        }
        if (n <= 64) {
            LibMatrixMult.matrixMultSparseDenseMMSkinnyRHS(lhs, rhs, out, n, rl, ru);
            return;
        }
        LibMatrixMult.matrixMultSparseDenseMM(lhs, rhs, out, n, cd, xsp, rl, ru);
    }

    /**
     * Sparse matrix times dense column vector: writes one dot product per
     * non-empty row {@code i} in {@code [rl, ru)} into {@code c}.
     *
     * <p>A row that stores exactly {@code cd} entries is treated as fully
     * populated and multiplied without consulting its column indexes.
     *
     * @param a  sparse left-hand side
     * @param b  dense vector, read through {@code valuesAt(0)}
     * @param c  output vector, written through {@code valuesAt(0)}
     * @param cd common dimension
     * @param rl first row (inclusive)
     * @param ru last row (exclusive)
     */
    private static void matrixMultSparseDenseMVShortRHS(SparseBlock a, DenseBlock b, DenseBlock c, int cd, int rl, int ru) {
        final double[] vec = b.valuesAt(0);
        final double[] out = c.valuesAt(0);
        for (int row = rl; row < ru; row++) {
            if (a.isEmpty(row)) {
                continue;
            }
            final int len = a.size(row);
            final int start = a.pos(row);
            final double[] vals = a.values(row);
            if (len == cd) {
                out[row] = LibMatrixMult.dotProduct(vals, vec, start, 0, cd);
            } else {
                out[row] = LibMatrixMult.dotProduct(vals, vec, a.indexes(row), start, 0, len);
            }
        }
    }

    /**
     * Sparse matrix times dense column vector, tiled over rows (512 at a time)
     * and over the common dimension so that each pass touches a bounded window
     * of {@code b}. Results are accumulated into {@code c}.
     *
     * <p>For every row of the current row tile, {@code curk} remembers how many
     * of its entries have already been consumed, so each pass over the common
     * dimension resumes where the previous one stopped.
     *
     * @param a   sparse left-hand side
     * @param b   dense vector, read through {@code valuesAt(0)}
     * @param c   output vector, accumulated through {@code valuesAt(0)}
     * @param cd  common dimension
     * @param xsp scaling factor for the window over the common dimension; the
     *            window is {@code max(2048, 64 * xsp)} entries wide
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     */
    private static void matrixMultSparseDenseMVTallRHS(SparseBlock a, DenseBlock b, DenseBlock c, int cd, long xsp, int rl, int ru) {
        final double[] bvals = b.valuesAt(0);
        final double[] cvals = c.valuesAt(0);
        final int blocksizeI = 512;
        // 2048 * xsp / 32 equals 64 * xsp. Clamp xsp first so that neither the long
        // product nor the narrowing cast to int can wrap into a meaningless (possibly
        // negative) window size; results are unchanged for values that already fit.
        final long clampedXsp = Math.max(0L, Math.min(xsp, Integer.MAX_VALUE / 64L));
        final int blocksizeK = (int) Math.max(2048L, 64L * clampedXsp);
        final int[] curk = new int[blocksizeI];
        for (int bi = rl; bi < ru; bi += blocksizeI) {
            Arrays.fill(curk, 0);
            final int bimin = Math.min(ru, bi + blocksizeI);
            // long window bounds: bk + blocksizeK may exceed Integer.MAX_VALUE
            for (long bk = 0; bk < cd; bk += blocksizeK) {
                final long bkmin = bk + blocksizeK;
                for (int i = bi; i < bimin; ++i) {
                    if (a.isEmpty(i)) continue;
                    final int apos = a.pos(i);
                    final int alen = a.size(i);
                    final int[] aix = a.indexes(i);
                    final double[] avals = a.values(i);
                    // consume the entries of row i whose column falls before bkmin
                    int k = curk[i - bi] + apos;
                    for (; k < apos + alen && aix[k] < bkmin; ++k) {
                        cvals[i] += avals[k] * bvals[aix[k]];
                    }
                    curk[i - bi] = k - apos;
                }
            }
        }
    }

    /**
     * Sparse row vector times dense matrix, restricted to the entries of the
     * vector whose column index lies in {@code [rl, ru)}. Each such entry scales
     * the matching row of {@code b} and is accumulated into the single output
     * row of {@code c}.
     *
     * <p>When {@code b} is one contiguous array, two entries are applied per
     * call where possible and rows of {@code b} are addressed as
     * {@code index * n}.
     *
     * @param a  sparse left-hand side; only row 0 is read
     * @param b  dense right-hand side
     * @param c  output row, accumulated through {@code valuesAt(0)}
     * @param n  number of columns of {@code b}
     * @param rl first row of {@code b} to use (inclusive)
     * @param ru last row of {@code b} to use (exclusive)
     */
    private static void matrixMultSparseDenseVM(SparseBlock a, DenseBlock b, DenseBlock c, int n, int rl, int ru) {
        if (a.isEmpty(0)) {
            return;
        }
        final int len = a.size(0);
        final int[] cols = a.indexes(0);
        final double[] vals = a.values(0);
        final double[] out = c.valuesAt(0);

        // first entry with column >= rl; a negative lookup result means "none"
        int first = (rl == 0) ? 0 : a.posFIndexGTE(0, rl);
        if (first < 0) {
            first = len;
        }

        if (!b.isContiguous()) {
            for (int k = first; k < len && cols[k] < ru; k++) {
                LibMatrixMult.vectMultiplyAdd(vals[k], b.values(cols[k]), out, b.pos(cols[k]), 0, n);
            }
            return;
        }

        final double[] bvals = b.valuesAt(0);
        int k = first;
        while (k < len && cols[k] < ru) {
            final boolean pairAvailable = k + 1 < len && cols[k + 1] < ru;
            if (pairAvailable) {
                LibMatrixMult.vectMultiplyAdd2(vals[k], vals[k + 1], bvals, out, cols[k] * n, cols[k + 1] * n, 0, n);
                k += 2;
            } else {
                LibMatrixMult.vectMultiplyAdd(vals[k], bvals, out, cols[k] * n, 0, n);
                k += 1;
            }
        }
    }

    /**
     * Sparse-dense multiply for a left-hand side with few rows, where this call
     * covers the slice {@code [rl, ru)} of the common dimension. For every
     * non-empty row of {@code a}, the entries whose column index falls into the
     * slice scale the matching rows of {@code b} and are accumulated into
     * {@code c}.
     *
     * <p>If the first and last referenced rows of {@code b} are reported as
     * contiguous, rows are addressed inside one array and applied in groups of
     * up to four; otherwise each row of {@code b} is looked up separately.
     *
     * @param a  sparse left-hand side (all of its rows are visited)
     * @param b  dense right-hand side
     * @param c  output block that is accumulated into
     * @param n  number of columns of {@code b} / {@code c}
     * @param cd common dimension
     * @param rl slice start over the common dimension (inclusive)
     * @param ru slice end over the common dimension (exclusive)
     */
    private static void matrixMultSparseDenseMMShortLHS(SparseBlock a, DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru) {
        final int rows = a.numRows();
        for (int r = 0; r < rows; r++) {
            if (a.isEmpty(r)) {
                continue;
            }
            final int start = a.pos(r);
            final int len = a.size(r);
            final int[] cols = a.indexes(r);
            final double[] vals = a.values(r);
            final double[] cvals = c.values(r);
            final int cix = c.pos(r);
            final int end = start + len;

            // translate the slice [rl, ru) into entry positions [from, to) of row r;
            // a negative lookup result maps to the end of the row
            int from = (rl == 0) ? 0 : a.posFIndexGTE(r, rl);
            from = (from >= 0) ? start + from : end;
            int to = (ru == cd) ? len : a.posFIndexGTE(r, ru);
            to = (to >= 0) ? start + to : end;

            final boolean singleArray = from < end
                && (from == to || b.isContiguous(cols[from], cols[to - 1]));
            if (!singleArray) {
                for (int k = from; k < to; k++) {
                    LibMatrixMult.vectMultiplyAdd(vals[k], b.values(cols[k]), cvals, b.pos(cols[k]), cix, n);
                }
                continue;
            }

            final double[] bvals = b.values(cols[from]);
            // difference between the global offset (row * n) and the offset inside bvals
            final int base = cols[from] * n - b.pos(cols[from]);
            final int rest = (to - from) % 4;
            if (rest == 1) {
                LibMatrixMult.vectMultiplyAdd(vals[from], bvals, cvals, cols[from] * n - base, cix, n);
            } else if (rest == 2) {
                LibMatrixMult.vectMultiplyAdd2(vals[from], vals[from + 1], bvals, cvals, cols[from] * n - base, cols[from + 1] * n - base, cix, n);
            } else if (rest == 3) {
                LibMatrixMult.vectMultiplyAdd3(vals[from], vals[from + 1], vals[from + 2], bvals, cvals, cols[from] * n - base, cols[from + 1] * n - base, cols[from + 2] * n - base, cix, n);
            }
            for (int k = from + rest; k < to; k += 4) {
                LibMatrixMult.vectMultiplyAdd4(vals[k], vals[k + 1], vals[k + 2], vals[k + 3], bvals, cvals, cols[k] * n - base, cols[k + 1] * n - base, cols[k + 2] * n - base, cols[k + 3] * n - base, cix, n);
            }
        }
    }

    /**
     * Sparse-dense multiply for a right-hand side with few columns. For each
     * non-empty row {@code i} in {@code [rl, ru)}, every stored entry scales the
     * matching row of {@code b} and is accumulated into row {@code i} of
     * {@code c}.
     *
     * <p>When {@code b} is one contiguous array, the first {@code alen % 4}
     * entries are applied singly and the rest in groups of four, addressing rows
     * of {@code b} as {@code index * n}. Otherwise all entries are applied
     * singly via per-row lookups.
     *
     * @param a  sparse left-hand side
     * @param b  dense right-hand side
     * @param c  output block that is accumulated into
     * @param n  number of columns of {@code b} / {@code c}
     * @param rl first row (inclusive)
     * @param ru last row (exclusive)
     */
    private static void matrixMultSparseDenseMMSkinnyRHS(SparseBlock a, DenseBlock b, DenseBlock c, int n, int rl, int ru) {
        for (int i = rl; i < ru; ++i) {
            if (a.isEmpty(i)) continue;
            final int apos = a.pos(i);
            final int alen = a.size(i);
            final int[] aix = a.indexes(i);
            final double[] avals = a.values(i);
            final double[] cvals = c.values(i);
            // Offset of row i inside cvals. The former hand-computed i * n is a global
            // offset and only matches when c is a single array; c.pos(i) is what the
            // sibling kernels pair with c.values(i).
            final int cix = c.pos(i);
            final int bn = b.isContiguous() ? alen % 4 : alen;
            for (int k = apos; k < apos + bn; ++k) {
                LibMatrixMult.vectMultiplyAdd(avals[k], b.values(aix[k]), cvals, b.pos(aix[k]), cix, n);
            }
            // only reached with work left when b is contiguous (otherwise bn == alen)
            final double[] bvals = b.valuesAt(0);
            for (int k = apos + bn; k < apos + alen; k += 4) {
                LibMatrixMult.vectMultiplyAdd4(avals[k], avals[k + 1], avals[k + 2], avals[k + 3], bvals, cvals, aix[k] * n, aix[k + 1] * n, aix[k + 2] * n, aix[k + 3] * n, cix, n);
            }
        }
    }

    /**
     * General sparse-dense matrix multiply, tiled over rows, the common
     * dimension and output columns (1024 per tile), accumulating into {@code c}.
     *
     * <p>Row and common-dimension tiles are both {@code 8 * xsp} wide. For each
     * row of the current row tile, {@code curk} records how many entries were
     * consumed by earlier common-dimension tiles; it is only advanced while
     * processing the last column tile, so every column tile starts from the same
     * entry position.
     *
     * <p>When {@code b} is one contiguous array, entries beyond the first
     * {@code alen % 4} are applied in groups of four, and a group that starts
     * inside the current common-dimension tile is applied in full.
     *
     * @param a   sparse left-hand side
     * @param b   dense right-hand side
     * @param c   output block that is accumulated into
     * @param n   number of columns of {@code b} / {@code c}
     * @param cd  common dimension
     * @param xsp scaling factor for the tile sizes
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     */
    private static void matrixMultSparseDenseMM(SparseBlock a, DenseBlock b, DenseBlock c, int n, int cd, long xsp, int rl, int ru) {
        // Clamp before scaling and narrowing: a plain (int) (8L * xsp) can wrap to zero
        // or a negative value for large xsp, which would stall the tile loops or make
        // the curk allocation fail. Values that already fit are unchanged.
        final long clampedXsp = Math.max(0L, Math.min(xsp, Integer.MAX_VALUE / 8L));
        final int blocksizeI = (int) Math.max(1L, 8L * clampedXsp);
        final int blocksizeK = blocksizeI;
        final int blocksizeJ = 1024;
        final int[] curk = new int[Math.min(blocksizeI, ru - rl)];
        for (int bi = rl; bi < ru; ) {
            Arrays.fill(curk, 0);
            // tile ends are computed in long so that bi + blocksizeI cannot overflow
            final int bimin = (int) Math.min((long) ru, (long) bi + blocksizeI);
            for (int bk = 0; bk < cd; ) {
                final int bkmin = (int) Math.min((long) cd, (long) bk + blocksizeK);
                for (int bj = 0; bj < n; bj += blocksizeJ) {
                    final int bjlen = Math.min(n, bj + blocksizeJ) - bj;
                    for (int i = bi; i < bimin; ++i) {
                        if (a.isEmpty(i)) continue;
                        final int apos = a.pos(i);
                        final int alen = a.size(i);
                        final int[] aix = a.indexes(i);
                        final double[] avals = a.values(i);
                        final double[] cvals = c.values(i);
                        final int cix = c.pos(i, bj);
                        final int bn = b.isContiguous() ? alen % 4 : alen;
                        // resume after the entries consumed by earlier common-dimension tiles
                        int k = curk[i - bi] + apos;
                        for (; k < apos + bn && aix[k] < bkmin; ++k) {
                            LibMatrixMult.vectMultiplyAdd(avals[k], b.values(aix[k]), cvals, b.pos(aix[k], bj), cix, bjlen);
                        }
                        // groups of four; only has work left when b is contiguous
                        final double[] bvals = b.valuesAt(0);
                        for (; k < apos + alen && aix[k] < bkmin; k += 4) {
                            LibMatrixMult.vectMultiplyAdd4(avals[k], avals[k + 1], avals[k + 2], avals[k + 3], bvals, cvals, aix[k] * n + bj, aix[k + 1] * n + bj, aix[k + 2] * n + bj, aix[k + 3] * n + bj, cix, bjlen);
                        }
                        // commit the new position only once the last column tile is done
                        if (bj + bjlen == n) {
                            curk[i - bi] = k - apos;
                        }
                    }
                }
                bk = bkmin;
            }
            bi = bimin;
        }
    }

    /**
     * Dispatches a sparse-sparse multiply to a specialised kernel.
     *
     * <p>Order of preference: vector-matrix (when {@code pm2} is set and
     * {@code m1} has one row), sparse output (when {@code sparse} is set; the
     * kernel's returned count is stored as the non-zero count of {@code ret}),
     * a kernel for right-hand sides with fewer than 2048 non-zeros, and finally
     * the general tiled kernel.
     *
     * @param m1     sparse left-hand side
     * @param m2     sparse right-hand side
     * @param ret    output matrix
     * @param pm2    enables the vector-matrix kernel when {@code m1} has one row
     * @param sparse whether the output is produced in sparse format
     * @param rl     range start (inclusive), forwarded to the kernel
     * @param ru     range end (exclusive), forwarded to the kernel
     */
    private static void matrixMultSparseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, boolean sparse, int rl, int ru) {
        final SparseBlock lhs = m1.sparseBlock;
        final SparseBlock rhs = m2.sparseBlock;
        final int m = m1.rlen;
        final int cd = m1.clen;
        final int n = m2.clen;

        if (pm2 && m == 1) {
            LibMatrixMult.matrixMultSparseSparseVM(lhs, rhs, ret.getDenseBlock(), rl, ru);
            return;
        }
        if (sparse) {
            final long nnz = LibMatrixMult.matrixMultSparseSparseSparseMM(lhs, rhs, ret.getSparseBlock(), n, rl, ru);
            ret.setNonZeros(nnz);
            return;
        }
        if (m2.nonZeros < 2048L) {
            LibMatrixMult.matrixMultSparseSparseMMSmallRHS(lhs, rhs, ret.getDenseBlock(), rl, ru);
            return;
        }
        LibMatrixMult.matrixMultSparseSparseMM(lhs, rhs, ret.getDenseBlock(), m, cd, m1.nonZeros, rl, ru);
    }

    /**
     * Sparse row vector times sparse matrix with dense output. Entries of the
     * vector whose column index lies in {@code [rl, ru)} scale the matching
     * non-empty row of {@code b}; the result is accumulated into the single
     * output row of {@code c}.
     *
     * @param a  sparse left-hand side; only row 0 is read
     * @param b  sparse right-hand side
     * @param c  output row, accumulated through {@code valuesAt(0)}
     * @param rl first row of {@code b} to use (inclusive)
     * @param ru last row of {@code b} to use (exclusive)
     */
    private static void matrixMultSparseSparseVM(SparseBlock a, SparseBlock b, DenseBlock c, int rl, int ru) {
        if (a.isEmpty(0)) {
            return;
        }
        final int len = a.size(0);
        final int[] cols = a.indexes(0);
        final double[] vals = a.values(0);
        final double[] out = c.valuesAt(0);

        // first entry with column >= rl; a negative lookup result means "none"
        int k = (rl == 0) ? 0 : a.posFIndexGTE(0, rl);
        if (k < 0) {
            k = len;
        }
        while (k < len && cols[k] < ru) {
            final int brow = cols[k];
            if (!b.isEmpty(brow)) {
                final int bpos = b.pos(brow);
                final int blen = b.size(brow);
                final int[] bix = b.indexes(brow);
                final double[] bvals = b.values(brow);
                LibMatrixMult.vectMultiplyAdd(vals[k], bvals, out, bix, bpos, 0, blen);
            }
            k++;
        }
    }

    /**
     * Sparse-sparse multiply with sparse output. Each output row is first
     * aggregated into a dense scratch row of length {@code n}; its non-zero
     * cells are then appended to row {@code i} of {@code c} and the scratch
     * cells are reset for the next row.
     *
     * @param a  sparse left-hand side
     * @param b  sparse right-hand side
     * @param c  sparse output block
     * @param n  number of output columns
     * @param rl first row (inclusive)
     * @param ru last row (exclusive); additionally capped by {@code a.numRows()}
     * @return the sum of the per-row non-zero counts of the rows written
     */
    private static long matrixMultSparseSparseSparseMM(SparseBlock a, SparseBlock b, SparseBlock c, int n, int rl, int ru) {
        final double[] scratch = new double[n];
        long total = 0L;
        for (int row = rl; row < Math.min(ru, a.numRows()); row++) {
            if (a.isEmpty(row)) {
                continue;
            }
            final int start = a.pos(row);
            final int end = start + a.size(row);
            final int[] cols = a.indexes(row);
            final double[] vals = a.values(row);

            boolean touched = false;
            for (int k = start; k < end; k++) {
                final int brow = cols[k];
                if (!b.isEmpty(brow)) {
                    LibMatrixMult.vectMultiplyAdd(vals[k], b.values(brow), scratch, b.indexes(brow), b.pos(brow), 0, b.size(brow));
                    touched = true;
                }
            }
            if (touched) {
                final int rowNnz = UtilFunctions.computeNnz(scratch, 0, n);
                total += (long) rowNnz;
                c.allocate(row, rowNnz);
                for (int col = 0; col < n; col++) {
                    final double v = scratch[col];
                    if (v != 0.0) {
                        c.append(row, col, v);
                        scratch[col] = 0.0;
                    }
                }
            }
        }
        return total;
    }

    /**
     * Sparse-sparse multiply with dense output and no tiling: for every entry
     * of a non-empty row of {@code a}, the matching non-empty row of {@code b}
     * is scaled and accumulated into the output row.
     *
     * @param a  sparse left-hand side
     * @param b  sparse right-hand side
     * @param c  output block that is accumulated into
     * @param rl first row (inclusive)
     * @param ru last row (exclusive); additionally capped by {@code a.numRows()}
     */
    private static void matrixMultSparseSparseMMSmallRHS(SparseBlock a, SparseBlock b, DenseBlock c, int rl, int ru) {
        for (int row = rl; row < Math.min(ru, a.numRows()); row++) {
            if (a.isEmpty(row)) {
                continue;
            }
            final int start = a.pos(row);
            final int end = start + a.size(row);
            final int[] cols = a.indexes(row);
            final double[] vals = a.values(row);
            final double[] out = c.values(row);
            final int outOff = c.pos(row);
            for (int k = start; k < end; k++) {
                final int brow = cols[k];
                if (!b.isEmpty(brow)) {
                    LibMatrixMult.vectMultiplyAdd(vals[k], b.values(brow), out, b.indexes(brow), b.pos(brow), outOff, b.size(brow));
                }
            }
        }
    }

    /**
     * Sparse-sparse multiply with dense output, tiled over 32 rows and over the
     * common dimension. The common-dimension tile is the square of
     * {@code m * cd / nnz1}, rounded by {@code UtilFunctions.nextIntPow2}, and
     * at least 32.
     *
     * <p>{@code curk} holds, per row of the current row tile, the number of
     * entries already consumed, so each common-dimension tile resumes where the
     * previous one stopped.
     *
     * @param a    sparse left-hand side
     * @param b    sparse right-hand side
     * @param c    output block that is accumulated into
     * @param m    number of rows used for the tile-size computation
     * @param cd   common dimension
     * @param nnz1 non-zero count used for the tile-size computation
     * @param rl   first row (inclusive)
     * @param ru   last row (exclusive)
     */
    private static void matrixMultSparseSparseMM(SparseBlock a, SparseBlock b, DenseBlock c, int m, int cd, long nnz1, int rl, int ru) {
        final int rowTile = 32;
        final double cellsPerNnz = (double) m * (double) cd / (double) nnz1;
        final int kTile = Math.max(32, UtilFunctions.nextIntPow2((int) Math.pow(cellsPerNnz, 2.0)));
        final int[] consumed = new int[Math.min(rowTile, ru - rl)];

        for (int rowBlock = rl; rowBlock < ru; rowBlock += rowTile) {
            Arrays.fill(consumed, 0);
            final int rowEnd = Math.min(ru, rowBlock + rowTile);
            for (int kBlock = 0; kBlock < cd; kBlock += kTile) {
                final int kEnd = Math.min(cd, kBlock + kTile);
                for (int row = rowBlock; row < rowEnd; row++) {
                    if (a.isEmpty(row)) {
                        continue;
                    }
                    final int start = a.pos(row);
                    final int end = start + a.size(row);
                    final int[] cols = a.indexes(row);
                    final double[] vals = a.values(row);
                    final double[] out = c.values(row);
                    final int outOff = c.pos(row);
                    final int slot = row - rowBlock;

                    int k = consumed[slot] + start;
                    while (k < end && cols[k] < kEnd) {
                        final int brow = cols[k];
                        if (!b.isEmpty(brow)) {
                            LibMatrixMult.vectMultiplyAdd(vals[k], b.values(brow), out, b.indexes(brow), b.pos(brow), outOff, b.size(brow));
                        }
                        k++;
                    }
                    consumed[slot] = k - start;
                }
            }
        }
    }

    /**
     * Straightforward sparse-sparse multiply with dense output, written as
     * plain nested loops without calling the vector helpers: every pair of an
     * entry {@code a(i, k)} and an entry {@code b(k, j)} adds their product to
     * output cell {@code (i, j)}.
     *
     * @param a  sparse left-hand side
     * @param b  sparse right-hand side
     * @param c  output block that is accumulated into
     * @param rl first row (inclusive)
     * @param ru last row (exclusive); additionally capped by {@code a.numRows()}
     */
    private static void matrixMultSparseSparseMMGeneric(SparseBlock a, SparseBlock b, DenseBlock c, int rl, int ru) {
        for (int row = rl; row < Math.min(ru, a.numRows()); row++) {
            if (a.isEmpty(row)) {
                continue;
            }
            final int start = a.pos(row);
            final int end = start + a.size(row);
            final int[] cols = a.indexes(row);
            final double[] vals = a.values(row);
            final double[] out = c.values(row);
            final int outOff = c.pos(row);
            for (int k = start; k < end; k++) {
                final int brow = cols[k];
                if (b.isEmpty(brow)) {
                    continue;
                }
                final double scale = vals[k];
                final int bstart = b.pos(brow);
                final int bend = bstart + b.size(brow);
                final int[] bcols = b.indexes(brow);
                final double[] bvals = b.values(brow);
                for (int j = bstart; j < bend; j++) {
                    out[outOff + bcols[j]] += scale * bvals[j];
                }
            }
        }
    }

    /**
     * Dispatches an ultra-sparse multiply. A self-product (both inputs are the
     * same object) uses the dedicated self kernel. Otherwise the left-driven
     * kernel is used when {@code m1Perm} is set or the left input qualifies
     * under the sparsity checks below, and the right-driven kernel in all other
     * cases.
     *
     * @param m1     left-hand side
     * @param m2     right-hand side
     * @param ret    output matrix
     * @param m1Perm forces the left-driven kernel for non-self products
     * @param rl     first row (inclusive), forwarded to the kernel
     * @param ru     last row (exclusive), forwarded to the kernel
     */
    private static void matrixMultUltraSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean m1Perm, int rl, int ru) {
        final boolean leftUS = m1.isUltraSparse()
            || (m1.isUltraSparse(false) && !m2.isUltraSparse())
            || (m1.sparse && !m2.sparse);

        if (m1 == m2) {
            LibMatrixMult.matrixMultUltraSparseSelf(m1, ret, rl, ru);
            return;
        }
        if (leftUS || m1Perm) {
            LibMatrixMult.matrixMultUltraSparseLeft(m1, m2, ret, rl, ru);
            return;
        }
        LibMatrixMult.matrixMultUltraSparseRight(m1, m2, ret, rl, ru);
    }

    /**
     * Ultra-sparse self-product {@code m1 * m1} with sparse output, for rows
     * {@code [rl, ru)}.
     *
     * <p>For every non-empty input row a non-zero estimate is obtained from
     * {@code UtilFunctions.computeNnz} and capped at the column count. If the
     * estimate exceeds {@code n / 128}, the row is aggregated in a dense
     * scratch row and then appended to the output; otherwise products are added
     * directly to the sparse output row, which is compacted after each
     * contributing input entry.
     *
     * <p>The non-zero count of {@code ret} is recomputed only when this call
     * covers all rows of {@code m1}.
     *
     * @param m1  sparse input, used as both operands
     * @param ret output matrix with an allocated sparse block
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     */
    private static void matrixMultUltraSparseSelf(MatrixBlock m1, MatrixBlock ret, int rl, int ru) {
        final int ncol = m1.clen;
        final SparseBlock in = m1.sparseBlock;
        final SparseBlock out = ret.sparseBlock;
        double[] denseRow = null; // allocated lazily, reused across rows

        for (int row = rl; row < ru; row++) {
            if (in.isEmpty(row)) {
                continue;
            }
            final int alen = in.size(row);
            final int apos = in.pos(row);
            final int[] aix = in.indexes(row);
            final double[] avals = in.values(row);

            final int nnzEstimate = (int) Math.min(UtilFunctions.computeNnz(in, aix, apos, alen), (long) ncol);
            final boolean useDense = nnzEstimate > ncol / 128;
            if (useDense) {
                if (denseRow == null) {
                    denseRow = new double[ncol];
                }
                Arrays.fill(denseRow, 0.0);
            }

            for (int k = apos; k < apos + alen; k++) {
                final int brow = aix[k];
                if (in.isEmpty(brow)) {
                    continue;
                }
                final int blen = in.size(brow);
                final int bpos = in.pos(brow);
                final int[] bix = in.indexes(brow);
                final double aval = avals[k];
                final double[] bvals = in.values(brow);
                final int bend = bpos + blen;
                if (useDense) {
                    for (int j = bpos; j < bend; j++) {
                        denseRow[bix[j]] += aval * bvals[j];
                    }
                } else {
                    out.allocate(row, nnzEstimate);
                    for (int j = bpos; j < bend; j++) {
                        out.add(row, bix[j], aval * bvals[j]);
                    }
                    out.compact(row);
                }
            }

            if (useDense) {
                final int rowNnz = UtilFunctions.computeNnz(denseRow, 0, ncol);
                out.allocate(row, rowNnz);
                for (int col = 0; col < ncol; col++) {
                    out.append(row, col, denseRow[col]);
                }
            }
        }

        if (rl == 0 && ru == m1.rlen) {
            ret.recomputeNonZeros();
        }
    }

    /**
     * Ultra-sparse multiply driven by the rows {@code [rl, ru)} of the sparse
     * left-hand side, producing sparse output.
     *
     * <p>A left row with exactly one entry selects one row of {@code m2}: that
     * row is copied into the output (from the sparse block or from the dense
     * block of {@code m2}) and then scaled unless the entry equals 1. Rows with
     * more entries fall back to cell-wise get/set on the matrices.
     *
     * <p>{@code ret.nonZeros} is incremented directly on the single-entry path.
     *
     * @param m1  sparse left-hand side
     * @param m2  right-hand side, sparse or dense
     * @param ret output matrix
     * @param rl  first row of {@code m1} (inclusive)
     * @param ru  last row of {@code m1} (exclusive)
     */
    private static void matrixMultUltraSparseLeft(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru) {
        int m = m1.rlen;
        int n = m2.clen;
        SparseBlock a = m1.sparseBlock;
        // NOTE(review): c is captured before the on-demand allocation below; the
        // scaling step reads through c, so this presumably relies on ret.sparseBlock
        // already being allocated by the caller - confirm.
        SparseBlock c = ret.sparseBlock;
        boolean rightSparse = m2.sparse;
        for (int i = rl; i < ru; ++i) {
            if (a.isEmpty(i)) continue;
            int apos = a.pos(i);
            int alen = a.size(i);
            int[] aixs = a.indexes(i);
            double[] avals = a.values(i);
            if (alen == 1) {
                // single entry: output row i is row aix of m2, times avals[apos]
                int aix = aixs[apos];
                int lnnz = 0;
                if (rightSparse) {
                    if (!m2.sparseBlock.isEmpty(aix)) {
                        ret.rlen = m;
                        ret.allocateSparseRowsBlock(false);
                        // the third argument of set(...) is true only for an MCSR right-hand side
                        boolean ldeep = m2.sparseBlock instanceof SparseBlockMCSR;
                        ret.sparseBlock.set(i, m2.sparseBlock.get(aix), ldeep);
                        lnnz = ret.sparseBlock.size(i);
                        ret.nonZeros += (long)lnnz;
                    }
                } else {
                    // dense right-hand side: count the row's non-zeros, then append every cell
                    lnnz = (int)m2.recomputeNonZeros(aix, aix, 0, n - 1);
                    if (lnnz > 0) {
                        c.allocate(i, lnnz);
                        double[] bvals = m2.getDenseBlock().values(aix);
                        int bix = m2.getDenseBlock().pos(aix);
                        for (int j = 0; j < n; ++j) {
                            c.append(i, j, bvals[bix + j]);
                        }
                        ret.nonZeros += (long)lnnz;
                    }
                }
                // no scaling needed for a factor of exactly 1 or an empty output row
                if (avals[apos] == 1.0 || lnnz <= 0) continue;
                if (c.get(i) instanceof SparseRowScalar) {
                    // a scalar row is replaced by a new scaled instance rather than updated in place
                    SparseRowScalar sv = (SparseRowScalar)c.get(i);
                    c.set(i, new SparseRowScalar(sv.getIndex(), sv.getValue() * avals[apos]), false);
                    continue;
                }
                LibMatrixMult.vectMultiplyInPlace(avals[apos], c.values(i), c.pos(i), c.size(i));
                continue;
            }
            // general case: cell-wise accumulation, skipping zero contributions
            for (int k = apos; k < apos + alen; ++k) {
                double aval = avals[k];
                int aix = aixs[k];
                for (int j = 0; j < n; ++j) {
                    double cval = ret.quickGetValue(i, j);
                    double cvald = aval * m2.quickGetValue(aix, j);
                    if (cvald == 0.0) continue;
                    ret.quickSetValue(i, j, cval + cvald);
                }
            }
        }
    }

    /**
     * Ultra-sparse multiply driven by the entries of the sparse right-hand
     * side. For every stored entry {@code b(k, j)} and every row {@code i} in
     * {@code [rl, ru)}, the product {@code b(k, j) * m1(i, k)} is added to
     * output cell {@code (i, j)} via cell-wise get/set; zero products are
     * skipped.
     *
     * @param m1  left-hand side, read cell-wise
     * @param m2  sparse right-hand side
     * @param ret output matrix, updated cell-wise
     * @param rl  first row of {@code m1} / {@code ret} (inclusive)
     * @param ru  last row of {@code m1} / {@code ret} (exclusive)
     */
    private static void matrixMultUltraSparseRight(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru) {
        final int cd = m1.clen;
        final SparseBlock rhs = m2.sparseBlock;
        for (int k = 0; k < cd; k++) {
            if (rhs.isEmpty(k)) {
                continue;
            }
            final int start = rhs.pos(k);
            final int end = start + rhs.size(k);
            final int[] cols = rhs.indexes(k);
            final double[] vals = rhs.values(k);
            for (int p = start; p < end; p++) {
                final double bval = vals[p];
                final int col = cols[p];
                for (int row = rl; row < ru; row++) {
                    final double delta = bval * m1.quickGetValue(row, k);
                    if (delta != 0.0) {
                        final double current = ret.quickGetValue(row, col);
                        ret.quickSetValue(row, col, current + delta);
                    }
                }
            }
        }
    }

    /**
     * Matrix-multiply chain over a dense {@code mX} for rows {@code [rl, ru)}:
     * for each row {@code x} a scalar is formed as {@code dot(x, v)} (or 0 if
     * {@code mV} has no dense values), then multiplied by {@code w[i]} for
     * chain type {@code XtwXv} or reduced by {@code w[i]} for {@code XtXvy},
     * and finally {@code scalar * x} is accumulated into the output vector.
     *
     * <p>The first {@code (ru - rl) % 24} rows are handled one at a time; the
     * rest is processed in groups of 24 rows with the columns tiled in chunks
     * of 1024.
     *
     * @param mX  dense input matrix
     * @param mV  vector {@code v}
     * @param mW  optional vector {@code w}; may be {@code null}
     * @param ret output matrix whose dense values are accumulated into
     * @param ct  chain type selecting how {@code w} is applied
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     */
    private static void matrixMultChainDense(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, MapMultChain.ChainType ct, int rl, int ru) {
        final DenseBlock x = mX.getDenseBlock();
        final double[] v = mV.getDenseBlockValues();
        final double[] w = (mW != null) ? mW.getDenseBlockValues() : null;
        final double[] out = ret.getDenseBlockValues();
        final int cd = mX.clen;
        final boolean scaleByW = (ct == MapMultChain.ChainType.XtwXv);
        final boolean subtractW = (ct == MapMultChain.ChainType.XtXvy);

        final int rowGroup = 24;
        final int colTile = 1024;
        final double[] partial = new double[rowGroup];
        final int rest = (ru - rl) % rowGroup;

        // rows that do not fill a complete group
        for (int i = rl; i < rl + rest; i++) {
            final double[] xvals = x.values(i);
            final int xoff = x.pos(i);
            double s = (v == null) ? 0.0 : LibMatrixMult.dotProduct(xvals, v, xoff, 0, cd);
            s *= scaleByW ? w[i] : 1.0;
            s -= subtractW ? w[i] : 0.0;
            LibMatrixMult.vectMultiplyAdd(s, xvals, out, xoff, 0, cd);
        }

        for (int bi = rl + rest; bi < ru; bi += rowGroup) {
            // step 1: per-row dot products with v, accumulated over column tiles
            Arrays.fill(partial, 0.0);
            if (v != null) {
                for (int bj = 0; bj < cd; bj += colTile) {
                    final int len = Math.min(cd - bj, colTile);
                    for (int i = 0; i < rowGroup; i++) {
                        partial[i] += LibMatrixMult.dotProduct(x.values(bi + i), v, x.pos(bi + i, bj), bj, len);
                    }
                }
            }

            // step 2: apply w in place, if the chain type asks for it
            if (scaleByW) {
                LibMatrixMult.vectMultiply(w, partial, bi, 0, rowGroup);
            } else if (subtractW) {
                LibMatrixMult.vectSubtract(w, partial, bi, 0, rowGroup);
            }

            // step 3: accumulate partial[i] * x(bi + i, :) into the output
            for (int bj = 0; bj < cd; bj += colTile) {
                final int len = Math.min(cd - bj, colTile);
                if (x.isContiguous()) {
                    final double[] xvals = x.values(0);
                    int xoff = bi * cd + bj;
                    for (int i = 0; i < rowGroup; i += 4, xoff += 4 * cd) {
                        LibMatrixMult.vectMultiplyAdd4(partial[i], partial[i + 1], partial[i + 2], partial[i + 3], xvals, out, xoff, xoff + cd, xoff + 2 * cd, xoff + 3 * cd, bj, len);
                    }
                } else {
                    for (int i = 0; i < rowGroup; i++) {
                        LibMatrixMult.vectMultiplyAdd(partial[i], x.values(bi + i), out, x.pos(bi + i, bj), bj, len);
                    }
                }
            }
        }
    }

    /**
     * Matrix-multiply chain over a sparse {@code mX} for rows {@code [rl, ru)}:
     * for each non-empty row {@code x} a scalar is formed as {@code dot(x, v)}
     * (or 0 if {@code mV} has no dense values), then multiplied by {@code w[i]}
     * for chain type {@code XtwXv} or reduced by {@code w[i]} for
     * {@code XtXvy}; a non-zero scalar times {@code x} is accumulated into the
     * output vector.
     *
     * <p>Rows are skipped early when they are empty, or when the chain type is
     * {@code XtwXv} and the row's weight is zero.
     *
     * @param mX  sparse input matrix
     * @param mV  vector {@code v}
     * @param mW  optional vector {@code w}; may be {@code null}
     * @param ret output matrix whose dense values are accumulated into
     * @param ct  chain type selecting how {@code w} is applied
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     */
    private static void matrixMultChainSparse(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, MapMultChain.ChainType ct, int rl, int ru) {
        final SparseBlock x = mX.sparseBlock;
        final double[] v = mV.getDenseBlockValues();
        final double[] w = (mW != null) ? mW.getDenseBlockValues() : null;
        final double[] out = ret.getDenseBlockValues();
        final boolean scaleByW = (ct == MapMultChain.ChainType.XtwXv);
        final boolean subtractW = (ct == MapMultChain.ChainType.XtXvy);

        for (int row = rl; row < ru; row++) {
            if (x.isEmpty(row) || (scaleByW && w[row] == 0.0)) {
                continue;
            }
            final int start = x.pos(row);
            final int len = x.size(row);
            final int[] cols = x.indexes(row);
            final double[] vals = x.values(row);

            double s = (v == null) ? 0.0 : LibMatrixMult.dotProduct(vals, v, cols, start, 0, len);
            s *= scaleByW ? w[row] : 1.0;
            s -= subtractW ? w[row] : 0.0;
            if (s != 0.0) {
                LibMatrixMult.vectMultiplyAdd(s, vals, out, cols, start, 0, len);
            }
        }
    }

    private static void matrixMultTransposeSelfDense(MatrixBlock m1, MatrixBlock ret, boolean leftTranspose, int rl, int ru) {
        DenseBlock a = m1.getDenseBlock();
        DenseBlock c = ret.getDenseBlock();
        int m = m1.rlen;
        int n = m1.clen;
        if (leftTranspose) {
            if (n == 1) {
                double[] avals = a.valuesAt(0);
                c.set(0, 0, LibMatrixMult.dotProduct(avals, avals, m));
            } else {
                int blocksizeI = 32;
                int blocksizeK = 24;
                int blocksizeJ = 1024;
                double[] ta = new double[24];
                int[] tbi = new int[24];
                int mx = ru;
                int cdx = m;
                int nx = n;
                for (int bi = rl; bi < mx; bi += 32) {
                    int bimin = Math.min(mx, bi + 32);
                    for (int bk = 0; bk < cdx; bk += 24) {
                        int bkmin = Math.min(cdx, bk + 24);
                        for (int bj = bi; bj < nx; bj += 1024) {
                            int bklen = bkmin - bk;
                            int bjlen = Math.min(nx, bj + 1024) - bj;
                            for (int i = bi; i < bimin; ++i) {
                                double[] cvals = c.values(i);
                                int cixj = c.pos(i, bj);
                                if (a.isContiguous(bk, bkmin - 1)) {
                                    double[] avals = a.values(bk);
                                    int aixi = a.pos(bk, i);
                                    int bkpos = a.pos(bk, bj);
                                    int knnz = LibMatrixMult.copyNonZeroElements(avals, aixi, bkpos, n, nx, ta, tbi, bklen);
                                    int bn = knnz % 4;
                                    switch (bn) {
                                        case 1: {
                                            LibMatrixMult.vectMultiplyAdd(ta[0], avals, cvals, tbi[0], cixj, bjlen);
                                            break;
                                        }
                                        case 2: {
                                            LibMatrixMult.vectMultiplyAdd2(ta[0], ta[1], avals, cvals, tbi[0], tbi[1], cixj, bjlen);
                                            break;
                                        }
                                        case 3: {
                                            LibMatrixMult.vectMultiplyAdd3(ta[0], ta[1], ta[2], avals, cvals, tbi[0], tbi[1], tbi[2], cixj, bjlen);
                                        }
                                    }
                                    for (int k = bn; k < knnz; k += 4) {
                                        LibMatrixMult.vectMultiplyAdd4(ta[k], ta[k + 1], ta[k + 2], ta[k + 3], avals, cvals, tbi[k], tbi[k + 1], tbi[k + 2], tbi[k + 3], cixj, bjlen);
                                    }
                                    continue;
                                }
                                for (int k = bk; k < bkmin; ++k) {
                                    int aix;
                                    double[] avals = a.values(bk);
                                    if (avals[aix = a.pos(bk, i)] == 0.0) continue;
                                    LibMatrixMult.vectMultiplyAdd(avals[aix], a.values(k), cvals, a.pos(k, bj), cixj, bjlen);
                                }
                            }
                        }
                    }
                }
            }
        } else if (m == 1) {
            double[] avals = a.valuesAt(0);
            c.set(0, 0, LibMatrixMult.dotProduct(avals, avals, n));
        } else {
            int blocksizeK = 1024;
            int blocksizeIJ = 32768 / blocksizeK / 2 - 1;
            for (int bi = rl; bi < ru; bi += blocksizeIJ) {
                int bimin = Math.min(ru, bi + blocksizeIJ);
                for (int bk = 0; bk < n; bk += blocksizeK) {
                    int bklen = Math.min(blocksizeK, n - bk);
                    for (int bj = bi; bj < m; bj += blocksizeIJ) {
                        int bjmin = Math.min(m, bj + blocksizeIJ);
                        for (int i = bi; i < bimin; ++i) {
                            int bjmax = Math.max(i, bj);
                            double[] avals = a.values(i);
                            double[] cvals = c.values(i);
                            int aix = a.pos(i, bk);
                            int cix = c.pos(i);
                            for (int j = bjmax; j < bjmin; ++j) {
                                int n2 = cix + j;
                                cvals[n2] = cvals[n2] + LibMatrixMult.dotProduct(avals, a.values(j), aix, a.pos(j, bk), bklen);
                            }
                        }
                    }
                }
            }
        }
    }

    /**
     * Sparse transpose-self multiply that accumulates into the dense block of {@code ret}.
     *
     * <p>Every non-empty input row is self-joined: a non-zero at column {@code i} (with
     * {@code rl <= i < ru}) scales the remainder of the same row (columns {@code >= i}) and
     * the scaled values are added to output row {@code i}. Consequently only cells with
     * column index {@code >=} row index are written.
     *
     * <p>Special cases:
     * <ul>
     *   <li>{@code leftTranspose == false} and a single input row: the output cell (0,0) is set
     *       to the dot product of row 0 with itself (nothing is written if the row is empty).</li>
     *   <li>{@code leftTranspose == true} and a completely filled row ({@code size == clen}):
     *       the row is processed with the dense kernel, without the index array.</li>
     * </ul>
     *
     * <p>NOTE(review): for {@code leftTranspose == false} with more than one row the same row
     * self-join is used; this presumably relies on the caller handing in an already
     * reorganized (transposed) input - confirm against the call site.
     *
     * <p>All value/index accesses are relative to {@code a.pos(r)}, so the method works for
     * sparse layouts whose rows share one backing array (non-zero row offset) as well as for
     * per-row arrays (offset 0).
     *
     * @param m1 sparse input; its {@code sparseBlock} is read
     * @param ret output; its dense block is updated in place
     * @param leftTranspose selects between the two transpose-self variants described above
     * @param rl first output row handled by this call (inclusive)
     * @param ru last output row handled by this call (exclusive)
     */
    private static void matrixMultTransposeSelfSparse(MatrixBlock m1, MatrixBlock ret, boolean leftTranspose, int rl, int ru) {
        SparseBlock a = m1.sparseBlock;
        DenseBlock c = ret.getDenseBlock();

        // single-row input of the non-transposed variant: plain self dot product of row 0
        if (!leftTranspose && m1.rlen == 1) {
            if (!a.isEmpty(0)) {
                int alen = a.size(0);
                double[] avals = a.values(0);
                c.set(0, 0, LibMatrixMult.dotProduct(avals, avals, alen));
            }
            return;
        }

        int n = m1.clen;
        int arlen = a.numRows();
        for (int r = 0; r < arlen; ++r) {
            if (a.isEmpty(r)) {
                continue;
            }
            int apos = a.pos(r);
            int alen = a.size(r);
            double[] avals = a.values(r);

            if (leftTranspose && alen == n) {
                // fully populated row: column i lives at offset apos + i of the value array
                for (int i = rl; i < ru; ++i) {
                    LibMatrixMult.vectMultiplyAdd(avals[apos + i], avals, c.values(i), apos + i, c.pos(i) + i, n - i);
                }
                continue;
            }

            int[] aix = a.indexes(r);
            // position of the first non-zero with column index >= rl (negative result: none)
            int rlix = rl == 0 ? 0 : a.posFIndexGTE(r, rl);
            int start = rlix >= 0 ? apos + rlix : apos + alen;
            int end = apos + alen;
            for (int i = start; i < end && aix[i] < ru; ++i) {
                double val = avals[i];
                if (val != 0.0) {
                    // i is an absolute array index, so the remaining length is end - i
                    LibMatrixMult.vectMultiplyAdd(val, avals, c.values(aix[i]), aix, i, c.pos(aix[i]), end - i);
                }
            }
        }
    }

    /**
     * Permutation multiply for a dense right-hand side and dense output.
     *
     * <p>{@code pm1} is read as a dense vector of 1-based target positions; entries {@code <= 0}
     * mark rows that are not selected. For a selected input row {@code i} the target position is
     * split into a block index and a row inside the block (block height is the row count of
     * {@code ret1}), and the whole row {@code i} of {@code m2} is copied to that row of the
     * current output. Once a target falls into a later block than the previously written one,
     * {@code ret2} is allocated as dense and becomes the current output.
     *
     * @param pm1 dense vector of target positions
     * @param m2 dense input whose rows are copied
     * @param ret1 first output block
     * @param ret2 second output block, allocated only when a later block is reached
     * @param rl first input row (inclusive)
     * @param ru last input row (exclusive)
     */
    private static void matrixMultPermuteDense(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru) {
        final double[] targets = pm1.getDenseBlockValues();
        final DenseBlock src = m2.getDenseBlock();
        DenseBlock dst = ret1.getDenseBlock();
        final int rowLen = m2.clen;
        final int blockRows = ret1.getNumRows();
        int prevBlock = -1;
        for (int row = rl; row < ru; row++) {
            final int target = UtilFunctions.toInt(targets[row]);
            if (target > 0) {
                final int rowInBlock = (target - 1) % blockRows;
                final int block = (target - 1) / blockRows;
                // crossing into a later block: continue writing into the second output
                if (prevBlock != -1 && prevBlock < block) {
                    ret2.sparse = false;
                    ret2.allocateDenseBlock();
                    dst = ret2.getDenseBlock();
                }
                System.arraycopy(src.values(row), src.pos(row), dst.values(rowInBlock), dst.pos(rowInBlock), rowLen);
                prevBlock = block;
            }
        }
    }

    /**
     * Permutation multiply for a dense right-hand side and sparse output.
     *
     * <p>{@code pm1} is read as a dense vector of 1-based target positions; entries {@code <= 0}
     * mark rows that are not selected. Each selected row {@code i} of {@code m2} is appended
     * cell by cell (all {@code m2.clen} columns, including zeros as passed to {@code append})
     * to the target row of the current sparse output. Once a target falls into a later block
     * than the previously written one, {@code ret2} is set up as a sparse block with the row
     * count of {@code ret1} and becomes the current output.
     *
     * @param pm1 dense vector of target positions
     * @param m2 dense input whose rows are appended
     * @param ret1 first output block (its {@code sparseBlock} is written)
     * @param ret2 second output block, allocated only when a later block is reached
     * @param rl first input row (inclusive)
     * @param ru last input row (exclusive)
     */
    private static void matrixMultPermuteDenseSparse(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru) {
        final double[] targets = pm1.getDenseBlockValues();
        final DenseBlock src = m2.getDenseBlock();
        SparseBlock dst = ret1.sparseBlock;
        final int rowLen = m2.clen;
        final int blockRows = ret1.getNumRows();
        int prevBlock = -1;
        for (int row = rl; row < ru; row++) {
            final int target = UtilFunctions.toInt(targets[row]);
            if (target > 0) {
                final double[] srcVals = src.values(row);
                final int srcOff = src.pos(row);
                final int rowInBlock = (target - 1) % blockRows;
                final int block = (target - 1) / blockRows;
                // crossing into a later block: continue writing into the second output
                if (prevBlock != -1 && prevBlock < block) {
                    ret2.sparse = true;
                    ret2.rlen = ret1.rlen;
                    ret2.allocateSparseRowsBlock();
                    dst = ret2.sparseBlock;
                }
                int col = 0;
                while (col < rowLen) {
                    dst.append(rowInBlock, col, srcVals[srcOff + col]);
                    col++;
                }
                prevBlock = block;
            }
        }
    }

    /**
     * Permutation multiply for a sparse right-hand side and sparse output.
     *
     * <p>{@code pm1} is read as a dense vector of 1-based target positions; entries {@code <= 0}
     * mark rows that are not selected. Each selected sparse row {@code i} of {@code m2} is
     * stored at the target row of the current output via {@code set(row, sparseRow, true)}.
     * Once a target falls into a later block than the previously written one, {@code ret2} is
     * allocated as a sparse block and becomes the current output.
     *
     * @param pm1 dense vector of target positions
     * @param m2 sparse input whose rows are transferred
     * @param ret1 first output block (its {@code sparseBlock} is written)
     * @param ret2 second output block, allocated only when a later block is reached
     * @param rl first input row (inclusive)
     * @param ru last input row (exclusive)
     */
    private static void matrixMultPermuteSparse(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru) {
        final double[] targets = pm1.getDenseBlockValues();
        final SparseBlock src = m2.sparseBlock;
        SparseBlock dst = ret1.sparseBlock;
        final int blockRows = ret1.getNumRows();
        int prevBlock = -1;
        for (int row = rl; row < ru; row++) {
            final int target = UtilFunctions.toInt(targets[row]);
            if (target > 0) {
                final int rowInBlock = (target - 1) % blockRows;
                final int block = (target - 1) / blockRows;
                // crossing into a later block: continue writing into the second output
                if (prevBlock != -1 && prevBlock < block) {
                    ret2.sparse = true;
                    ret2.allocateSparseRowsBlock();
                    dst = ret2.sparseBlock;
                }
                dst.set(rowInBlock, src.get(row), true);
                prevBlock = block;
            }
        }
    }

    /**
     * Weighted squared loss over dense inputs for rows {@code [rl, ru)}; the scalar result is
     * written to cell (0,0) of {@code ret}.
     *
     * <p>With {@code uv(i,j)} denoting the dot product of row {@code i} of U and row {@code j}
     * of V (length {@code mU.clen}), the accumulated terms per weights type are:
     * <ul>
     *   <li>POST: {@code w * (x - uv)^2} for cells with {@code w != 0}</li>
     *   <li>POST_NZ: {@code (x - uv)^2} for cells with {@code x != 0}</li>
     *   <li>PRE: {@code (x - w * uv)^2} for all cells ({@code uv} is only computed if {@code w != 0})</li>
     *   <li>NONE: {@code (x - uv)^2} for all cells</li>
     * </ul>
     * Cells are visited in 16x16 tiles of (row, column).
     *
     * @param mX dense data matrix X
     * @param mU dense factor U
     * @param mV dense factor V (indexed by column of X)
     * @param mW dense weights W, may be {@code null} when the weights type does not use it
     * @param ret receives the loss in cell (0,0)
     * @param wt weights type selecting the formula
     * @param rl first row (inclusive)
     * @param ru last row (exclusive)
     */
    private static void matrixMultWSLossDense(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightedSquaredLoss.WeightsType wt, int rl, int ru) {
        final DenseBlock x = mX.getDenseBlock();
        final DenseBlock u = mU.getDenseBlock();
        final DenseBlock v = mV.getDenseBlock();
        final DenseBlock w = (mW == null) ? null : mW.getDenseBlock();
        final int n = mX.clen;
        final int cd = mU.clen;
        final int tile = 16;
        double loss = 0.0;

        for (int rowStart = rl; rowStart < ru; rowStart += tile) {
            final int rowEnd = Math.min(ru, rowStart + tile);
            for (int colStart = 0; colStart < n; colStart += tile) {
                final int colEnd = Math.min(n, colStart + tile);

                if (wt == WeightedSquaredLoss.WeightsType.POST) {
                    for (int i = rowStart; i < rowEnd; ++i) {
                        final double[] wRow = w.values(i);
                        final double[] xRow = x.values(i);
                        final double[] uRow = u.values(i);
                        final int xOff = x.pos(i);
                        final int uOff = u.pos(i);
                        for (int j = colStart; j < colEnd; ++j) {
                            final double wij = wRow[xOff + j];
                            if (wij != 0.0) {
                                final double uvij = LibMatrixMult.dotProduct(uRow, v.values(j), uOff, v.pos(j), cd);
                                final double diff = xRow[xOff + j] - uvij;
                                loss += wij * diff * diff;
                            }
                        }
                    }
                } else if (wt == WeightedSquaredLoss.WeightsType.POST_NZ) {
                    for (int i = rowStart; i < rowEnd; ++i) {
                        final double[] xRow = x.values(i);
                        final double[] uRow = u.values(i);
                        final int xOff = x.pos(i);
                        final int uOff = u.pos(i);
                        for (int j = colStart; j < colEnd; ++j) {
                            final double xij = xRow[xOff + j];
                            if (xij != 0.0) {
                                final double uvij = LibMatrixMult.dotProduct(uRow, v.values(j), uOff, v.pos(j), cd);
                                final double diff = xij - uvij;
                                loss += diff * diff;
                            }
                        }
                    }
                } else if (wt == WeightedSquaredLoss.WeightsType.PRE) {
                    for (int i = rowStart; i < rowEnd; ++i) {
                        final double[] wRow = w.values(i);
                        final double[] xRow = x.values(i);
                        final double[] uRow = u.values(i);
                        final int xOff = x.pos(i);
                        final int uOff = u.pos(i);
                        for (int j = colStart; j < colEnd; ++j) {
                            final double wij = wRow[xOff + j];
                            // the dot product is skipped for zero weights; it would be multiplied by 0
                            final double uvij = (wij != 0.0)
                                ? LibMatrixMult.dotProduct(uRow, v.values(j), uOff, v.pos(j), cd)
                                : 0.0;
                            final double diff = xRow[xOff + j] - wij * uvij;
                            loss += diff * diff;
                        }
                    }
                } else if (wt == WeightedSquaredLoss.WeightsType.NONE) {
                    for (int i = rowStart; i < rowEnd; ++i) {
                        final double[] xRow = x.values(i);
                        final double[] uRow = u.values(i);
                        final int xOff = x.pos(i);
                        final int uOff = u.pos(i);
                        for (int j = colStart; j < colEnd; ++j) {
                            final double uvij = LibMatrixMult.dotProduct(uRow, v.values(j), uOff, v.pos(j), cd);
                            final double diff = xRow[xOff + j] - uvij;
                            loss += diff * diff;
                        }
                    }
                }
            }
        }
        ret.quickSetValue(0, 0, loss);
    }

    /**
     * Weighted squared loss for sparse X (and sparse W, if present) with dense factors U and V,
     * over rows {@code [rl, ru)}; the scalar result is written to cell (0,0) of {@code ret}.
     *
     * <p>With {@code uv(i,j)} denoting the dot product of row {@code i} of U and row {@code j}
     * of V (length {@code mU.clen}), the accumulated terms per weights type are:
     * <ul>
     *   <li>POST: {@code w * (x - uv)^2} over the non-zeros of W. If row {@code i} of W is
     *       aligned with X, the value of X is read at the same array position; otherwise it is
     *       looked up through {@code mX.quickGetValue}.</li>
     *   <li>POST_NZ: {@code (x - uv)^2} over the non-zeros of X.</li>
     *   <li>PRE: {@code (x - w * uv)^2} over all cells, using cell lookups on X and W.</li>
     *   <li>NONE: {@code x^2 - 2 * x * uv} over the non-zeros of X. NOTE(review): this is only
     *       part of the expanded square; the remaining term is presumably added by the caller
     *       (see {@code addMatrixMultWSLossNoWeightCorrection}) - confirm.</li>
     * </ul>
     * POST_NZ and NONE iterate in square (row, column) tiles and remember, per row of the
     * current row tile, how far the column scan has progressed.
     *
     * @param mX sparse data matrix X
     * @param mU dense factor U
     * @param mV dense factor V (indexed by column of X)
     * @param mW sparse weights W, may be {@code null} when the weights type does not use it
     * @param ret receives the loss in cell (0,0)
     * @param wt weights type selecting the formula
     * @param rl first row (inclusive)
     * @param ru last row (exclusive)
     */
    private static void matrixMultWSLossSparseDense(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightedSquaredLoss.WeightsType wt, int rl, int ru) {
        SparseBlock x = mX.sparseBlock;
        SparseBlock w = mW != null ? mW.sparseBlock : null;
        DenseBlock u = mU.getDenseBlock();
        DenseBlock v = mV.getDenseBlock();
        int n = mX.clen;
        int cd = mU.clen;
        double wsloss = 0.0;
        if (wt == WeightedSquaredLoss.WeightsType.POST) {
            for (int i = rl; i < ru; ++i) {
                if (w.isEmpty(i)) continue;
                int wpos = w.pos(i);
                int wlen = w.size(i);
                int[] wix = w.indexes(i);
                double[] wval = w.values(i);
                double[] uvals = u.values(i);
                int uix = u.pos(i);
                if (w.isAligned(i, x)) {
                    // aligned rows share positions, so X can be read at the same index k
                    double[] xval = x.values(i);
                    for (int k = wpos; k < wpos + wlen; ++k) {
                        double uvij = LibMatrixMult.dotProduct(uvals, v.values(wix[k]), uix, v.pos(wix[k]), cd);
                        wsloss += wval[k] * (xval[k] - uvij) * (xval[k] - uvij);
                    }
                    continue;
                }
                for (int k = wpos; k < wpos + wlen; ++k) {
                    double xi = mX.quickGetValue(i, wix[k]);
                    double uvij = LibMatrixMult.dotProduct(uvals, v.values(wix[k]), uix, v.pos(wix[k]), cd);
                    wsloss += wval[k] * (xi - uvij) * (xi - uvij);
                }
            }
        } else if (wt == WeightedSquaredLoss.WeightsType.POST_NZ) {
            int blocksizeIJ = LibMatrixMult.wslossSparseBlocksize(mX, rl, ru);
            int[] curk = new int[blocksizeIJ];
            for (int bi = rl; bi < ru; bi += blocksizeIJ) {
                int bimin = Math.min(ru, bi + blocksizeIJ);
                // restart the per-row column cursors for every row tile
                Arrays.fill(curk, 0);
                for (int bj = 0; bj < n; bj += blocksizeIJ) {
                    int bjmin = Math.min(n, bj + blocksizeIJ);
                    for (int i = bi; i < bimin; ++i) {
                        int k;
                        if (x.isEmpty(i)) continue;
                        int xpos = x.pos(i);
                        int xlen = x.size(i);
                        int[] xix = x.indexes(i);
                        double[] xval = x.values(i);
                        double[] uvals = u.values(i);
                        int uix = u.pos(i);
                        for (k = xpos + curk[i - bi]; k < xpos + xlen && xix[k] < bjmin; ++k) {
                            double uvij = LibMatrixMult.dotProduct(uvals, v.values(xix[k]), uix, v.pos(xix[k]), cd);
                            wsloss += (xval[k] - uvij) * (xval[k] - uvij);
                        }
                        // remember where this row's scan stopped for the next column tile
                        curk[i - bi] = k - xpos;
                    }
                }
            }
        } else if (wt == WeightedSquaredLoss.WeightsType.PRE) {
            for (int i = rl; i < ru; ++i) {
                double[] uvals = u.values(i);
                int uix = u.pos(i);
                for (int j = 0; j < n; ++j) {
                    double xij = mX.quickGetValue(i, j);
                    double wij = mW.quickGetValue(i, j);
                    double uvij = 0.0;
                    if (wij != 0.0) {
                        uvij = LibMatrixMult.dotProduct(uvals, v.values(j), uix, v.pos(j), cd);
                    }
                    wsloss += (xij - wij * uvij) * (xij - wij * uvij);
                }
            }
        } else if (wt == WeightedSquaredLoss.WeightsType.NONE) {
            int blocksizeIJ = LibMatrixMult.wslossSparseBlocksize(mX, rl, ru);
            int[] curk = new int[blocksizeIJ];
            for (int bi = rl; bi < ru; bi += blocksizeIJ) {
                int bimin = Math.min(ru, bi + blocksizeIJ);
                // restart the per-row column cursors for every row tile
                Arrays.fill(curk, 0);
                for (int bj = 0; bj < n; bj += blocksizeIJ) {
                    int bjmin = Math.min(n, bj + blocksizeIJ);
                    for (int i = bi; i < bimin; ++i) {
                        int k;
                        if (x.isEmpty(i)) continue;
                        int xpos = x.pos(i);
                        int xlen = x.size(i);
                        int[] xix = x.indexes(i);
                        double[] xval = x.values(i);
                        double[] uvals = u.values(i);
                        int uix = u.pos(i);
                        for (k = xpos + curk[i - bi]; k < xpos + xlen && xix[k] < bjmin; ++k) {
                            double xij = xval[k];
                            double uvij = LibMatrixMult.dotProduct(uvals, v.values(xix[k]), uix, v.pos(xix[k]), cd);
                            wsloss += xij * xij - 2.0 * xij * uvij;
                        }
                        // remember where this row's scan stopped for the next column tile
                        curk[i - bi] = k - xpos;
                    }
                }
            }
        }
        ret.quickSetValue(0, 0, wsloss);
    }

    /**
     * Tile edge length for the blocked sparse loops of {@code matrixMultWSLossSparseDense}.
     *
     * <p>The base value is {@code 8 * rlen * clen / nonZeros} of {@code mX}. It is guarded so
     * that a non-positive non-zero count cannot cause a division by zero, and clamped to
     * {@code [1, max(ru - rl, mX.clen)]} so that the narrowing to {@code int} can neither wrap
     * to a zero/negative size nor request a cursor array far larger than the row range.
     * Any base value at or above the upper bound already yields a single tile per dimension,
     * so clamping does not change the iteration order.
     *
     * @param mX sparse data matrix providing dimensions and non-zero count
     * @param rl first row (inclusive) of the processed range
     * @param ru last row (exclusive) of the processed range
     * @return tile edge length, always {@code >= 1}
     */
    private static int wslossSparseBlocksize(MatrixBlock mX, int rl, int ru) {
        long nnz = Math.max(1L, mX.nonZeros);
        long base = 8L * (long)mX.rlen * (long)mX.clen / nnz;
        long upper = Math.max(1L, Math.max((long)ru - (long)rl, (long)mX.clen));
        return (int)Math.max(1L, Math.min(base, upper));
    }

    /**
     * Weighted squared loss for arbitrary dense/sparse combinations of the inputs, over rows
     * {@code [rl, ru)}; the scalar result is written to cell (0,0) of {@code ret}.
     *
     * <p>{@code uv(i,j)} is obtained from {@code dotProductGeneric(mU, mV, i, j, mU.clen)}.
     * Accumulated terms per weights type:
     * <ul>
     *   <li>POST: {@code w * (x - uv)^2} over the non-zeros of W (sparse W: stored entries;
     *       dense W: cells with {@code w != 0}); X is read via {@code quickGetValue}.</li>
     *   <li>POST_NZ: {@code (x - uv)^2} over the non-zeros of X.</li>
     *   <li>PRE: {@code (x - w * uv)^2} over all cells ({@code uv} only computed if {@code w != 0}).</li>
     *   <li>NONE: {@code x^2 - 2 * x * uv} over the non-zeros of X.</li>
     * </ul>
     *
     * @param mX data matrix X (dense or sparse)
     * @param mU factor U
     * @param mV factor V (indexed by column of X)
     * @param mW weights W; dereferenced only for POST and PRE
     * @param ret receives the loss in cell (0,0)
     * @param wt weights type selecting the formula
     * @param rl first row (inclusive)
     * @param ru last row (exclusive)
     */
    private static void matrixMultWSLossGeneric(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightedSquaredLoss.WeightsType wt, int rl, int ru) {
        final int n = mX.clen;
        final int cd = mU.clen;
        double loss = 0.0;

        if (wt == WeightedSquaredLoss.WeightsType.POST) {
            if (mW.sparse) {
                final SparseBlock w = mW.sparseBlock;
                for (int i = rl; i < ru; ++i) {
                    if (!w.isEmpty(i)) {
                        final int first = w.pos(i);
                        final int count = w.size(i);
                        final int[] cols = w.indexes(i);
                        final double[] weights = w.values(i);
                        final int last = first + count;
                        for (int k = first; k < last; ++k) {
                            final double uvij = LibMatrixMult.dotProductGeneric(mU, mV, i, cols[k], cd);
                            final double diff = mX.quickGetValue(i, cols[k]) - uvij;
                            loss += weights[k] * diff * diff;
                        }
                    }
                }
            } else {
                final DenseBlock w = mW.getDenseBlock();
                for (int i = rl; i < ru; ++i) {
                    final double[] wRow = w.values(i);
                    final int wOff = w.pos(i);
                    for (int j = 0; j < n; ++j) {
                        if (wRow[wOff + j] != 0.0) {
                            final double uvij = LibMatrixMult.dotProductGeneric(mU, mV, i, j, cd);
                            final double diff = mX.quickGetValue(i, j) - uvij;
                            loss += wRow[wOff + j] * diff * diff;
                        }
                    }
                }
            }
        } else if (wt == WeightedSquaredLoss.WeightsType.POST_NZ) {
            if (mX.sparse) {
                final SparseBlock x = mX.sparseBlock;
                for (int i = rl; i < ru; ++i) {
                    if (!x.isEmpty(i)) {
                        final int first = x.pos(i);
                        final int count = x.size(i);
                        final int[] cols = x.indexes(i);
                        final double[] vals = x.values(i);
                        final int last = first + count;
                        for (int k = first; k < last; ++k) {
                            final double uvij = LibMatrixMult.dotProductGeneric(mU, mV, i, cols[k], cd);
                            final double diff = vals[k] - uvij;
                            loss += diff * diff;
                        }
                    }
                }
            } else {
                final DenseBlock x = mX.getDenseBlock();
                for (int i = rl; i < ru; ++i) {
                    final double[] xRow = x.values(i);
                    final int xOff = x.pos(i);
                    for (int j = 0; j < n; ++j) {
                        final double xij = xRow[xOff + j];
                        if (xij != 0.0) {
                            final double uvij = LibMatrixMult.dotProductGeneric(mU, mV, i, j, cd);
                            final double diff = xij - uvij;
                            loss += diff * diff;
                        }
                    }
                }
            }
        } else if (wt == WeightedSquaredLoss.WeightsType.PRE) {
            for (int i = rl; i < ru; ++i) {
                for (int j = 0; j < n; ++j) {
                    final double xij = mX.quickGetValue(i, j);
                    final double wij = mW.quickGetValue(i, j);
                    // the dot product is skipped for zero weights; it would be multiplied by 0
                    final double uvij = (wij != 0.0) ? LibMatrixMult.dotProductGeneric(mU, mV, i, j, cd) : 0.0;
                    final double diff = xij - wij * uvij;
                    loss += diff * diff;
                }
            }
        } else if (wt == WeightedSquaredLoss.WeightsType.NONE) {
            if (mX.sparse) {
                final SparseBlock x = mX.sparseBlock;
                for (int i = rl; i < ru; ++i) {
                    if (!x.isEmpty(i)) {
                        final int first = x.pos(i);
                        final int count = x.size(i);
                        final int[] cols = x.indexes(i);
                        final double[] vals = x.values(i);
                        final int last = first + count;
                        for (int k = first; k < last; ++k) {
                            final double xij = vals[k];
                            final double uvij = LibMatrixMult.dotProductGeneric(mU, mV, i, cols[k], cd);
                            loss += xij * xij - 2.0 * xij * uvij;
                        }
                    }
                }
            } else {
                final DenseBlock x = mX.getDenseBlock();
                for (int i = rl; i < ru; ++i) {
                    final double[] xRow = x.values(i);
                    final int xOff = x.pos(i);
                    for (int j = 0; j < n; ++j) {
                        final double xij = xRow[xOff + j];
                        if (xij != 0.0) {
                            final double uvij = LibMatrixMult.dotProductGeneric(mU, mV, i, j, cd);
                            loss += xij * xij - 2.0 * xij * uvij;
                        }
                    }
                }
            }
        }
        ret.quickSetValue(0, 0, loss);
    }

    /**
     * Adds a correction term to the value already stored in cell (0,0) of {@code ret}.
     *
     * <p>Two temporary {@code mU.clen x mU.clen} blocks receive the transpose-self products of
     * {@code mU} and {@code mV} (both with {@code leftTranspose = true}); the correction is the
     * dot product of these two results, computed with the generic kernel if either temporary
     * ended up sparse and with the dense kernel over {@code mU.clen * mU.clen} cells otherwise.
     *
     * @param mU factor U
     * @param mV factor V
     * @param ret block whose cell (0,0) is incremented
     * @param k forwarded to {@code matrixMultTransposeSelf} (presumably the degree of
     *          parallelism - confirm)
     */
    private static void addMatrixMultWSLossNoWeightCorrection(MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, int k) {
        final int dim = mU.clen;
        MatrixBlock selfU = new MatrixBlock(dim, dim, false);
        MatrixBlock selfV = new MatrixBlock(dim, dim, false);
        LibMatrixMult.matrixMultTransposeSelf(mU, selfU, true, k);
        LibMatrixMult.matrixMultTransposeSelf(mV, selfV, true, k);

        final double current = ret.quickGetValue(0, 0);
        final double correction;
        if (selfU.sparse || selfV.sparse) {
            correction = LibMatrixMult.dotProductGeneric(selfU, selfV);
        } else {
            correction = LibMatrixMult.dotProduct(selfU.getDenseBlockValues(), selfV.getDenseBlockValues(), mU.clen * mU.clen);
        }
        ret.quickSetValue(0, 0, current + correction);
    }

    /**
     * Weighted sigmoid over dense inputs using the native matrix multiply.
     *
     * <p>First the product of the two factors is written into the dense values of {@code ret}
     * through {@code NativeHelper.dmmdd}; if W has a single row the operands are swapped
     * (V first, U second). A negative return value of the native call triggers a warning and
     * the Java {@code matrixMult} fallback. Afterwards every cell {@code c[i]} is replaced by
     * {@code w[i] * f(c[i])}, where {@code f} is {@code 1/(1+exp(-c))}, or {@code 1/(1+exp(c))}
     * for the MINUS variants, with an additional {@code log} for the LOG variants.
     *
     * @param mW dense weights W
     * @param mU dense factor U
     * @param mV dense factor V
     * @param ret dense output, overwritten in place
     * @param wt sigmoid variant
     */
    private static void matrixMultWSigmoidDenseNative(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedSigmoid.WSigmoidType wt) {
        final double[] w = mW.getDenseBlockValues();
        final double[] c = ret.getDenseBlockValues();
        final int m = mW.rlen;
        final int n = mW.clen;
        final int cd = mU.clen;
        final boolean flagminus = wt == WeightedSigmoid.WSigmoidType.MINUS || wt == WeightedSigmoid.WSigmoidType.LOG_MINUS;
        final boolean flaglog = wt == WeightedSigmoid.WSigmoidType.LOG || wt == WeightedSigmoid.WSigmoidType.LOG_MINUS;

        // single-row W: swap the operands of the multiply
        final boolean singleRow = (m == 1);
        final MatrixBlock first = singleRow ? mV : mU;
        final MatrixBlock second = singleRow ? mU : mV;

        final long nnz = NativeHelper.dmmdd(first.getDenseBlockValues(), second.getDenseBlockValues(), c, singleRow ? n : m, cd, 1, 1);
        if (nnz < 0L) {
            LOG.warn((Object)"matrixMultWSigmoidDenseNative: Native mat mult failed. Falling back to java version.");
            LibMatrixMult.matrixMult(first, second, ret, false);
        }

        final int cells = m * n;
        for (int idx = 0; idx < cells; ++idx) {
            final double exponent = flagminus ? c[idx] : -c[idx];
            final double sigmoid = 1.0 / (1.0 + FastMath.exp((double)exponent));
            c[idx] = w[idx] * (flaglog ? Math.log(sigmoid) : sigmoid);
        }
    }

    /**
     * Weighted sigmoid over dense inputs for rows {@code [rl, ru)}.
     *
     * <p>For every cell of W that is non-zero, the corresponding output cell is set to the
     * result of {@code wsigmoid} applied to the weight, row {@code i} of U and row {@code j}
     * of V. Cells with a zero weight are left untouched. Cells are visited in 16x16 tiles.
     *
     * @param mW dense weights W
     * @param mU dense factor U
     * @param mV dense factor V (indexed by column of W)
     * @param ret dense output, written at the same offsets as W
     * @param wt sigmoid variant (MINUS/LOG flags are derived from it)
     * @param rl first row (inclusive)
     * @param ru last row (exclusive)
     */
    private static void matrixMultWSigmoidDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedSigmoid.WSigmoidType wt, int rl, int ru) {
        final DenseBlock w = mW.getDenseBlock();
        final DenseBlock c = ret.getDenseBlock();
        final DenseBlock u = mU.getDenseBlock();
        final DenseBlock v = mV.getDenseBlock();
        final int n = mW.clen;
        final int cd = mU.clen;
        final boolean flagminus = wt == WeightedSigmoid.WSigmoidType.MINUS || wt == WeightedSigmoid.WSigmoidType.LOG_MINUS;
        final boolean flaglog = wt == WeightedSigmoid.WSigmoidType.LOG || wt == WeightedSigmoid.WSigmoidType.LOG_MINUS;
        final int tile = 16;

        for (int rowStart = rl; rowStart < ru; rowStart += tile) {
            final int rowEnd = Math.min(ru, rowStart + tile);
            for (int colStart = 0; colStart < n; colStart += tile) {
                final int colEnd = Math.min(n, colStart + tile);
                for (int i = rowStart; i < rowEnd; ++i) {
                    final double[] wRow = w.values(i);
                    final double[] uRow = u.values(i);
                    final double[] cRow = c.values(i);
                    final int wOff = w.pos(i);
                    final int uOff = u.pos(i);
                    for (int j = colStart; j < colEnd; ++j) {
                        final double wij = wRow[wOff + j];
                        if (wij != 0.0) {
                            cRow[wOff + j] = LibMatrixMult.wsigmoid(wij, uRow, v.values(j), uOff, v.pos(j), flagminus, flaglog, cd);
                        }
                    }
                }
            }
        }
    }

    /**
     * Weighted sigmoid for sparse W with dense factors, for rows {@code [rl, ru)}.
     *
     * <p>For every non-empty row of W the output row is allocated with the same number of
     * entries, and for each stored entry of W the {@code wsigmoid} result (weight, row
     * {@code i} of U, row {@code wix[k]} of V) is appended at the same column.
     *
     * @param mW sparse weights W
     * @param mU dense factor U
     * @param mV dense factor V (indexed by column of W)
     * @param ret output; its {@code sparseBlock} is appended to
     * @param wt sigmoid variant (MINUS/LOG flags are derived from it)
     * @param rl first row (inclusive)
     * @param ru last row (exclusive)
     */
    private static void matrixMultWSigmoidSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedSigmoid.WSigmoidType wt, int rl, int ru) {
        final SparseBlock w = mW.sparseBlock;
        final SparseBlock c = ret.sparseBlock;
        final DenseBlock u = mU.getDenseBlock();
        final DenseBlock v = mV.getDenseBlock();
        final int cd = mU.clen;
        final boolean flagminus = wt == WeightedSigmoid.WSigmoidType.MINUS || wt == WeightedSigmoid.WSigmoidType.LOG_MINUS;
        final boolean flaglog = wt == WeightedSigmoid.WSigmoidType.LOG || wt == WeightedSigmoid.WSigmoidType.LOG_MINUS;

        for (int i = rl; i < ru; ++i) {
            if (!w.isEmpty(i)) {
                final int first = w.pos(i);
                final int count = w.size(i);
                final int[] cols = w.indexes(i);
                final double[] weights = w.values(i);
                final double[] uRow = u.values(i);
                final int uOff = u.pos(i);
                c.allocate(i, count);
                final int last = first + count;
                for (int k = first; k < last; ++k) {
                    final int col = cols[k];
                    final double out = LibMatrixMult.wsigmoid(weights[k], uRow, v.values(col), uOff, v.pos(col), flagminus, flaglog, cd);
                    c.append(i, col, out);
                }
            }
        }
    }

    /**
     * Weighted sigmoid for arbitrary dense/sparse factor representations, for rows
     * {@code [rl, ru)}. The output representation follows W.
     *
     * <p>Sparse W: for each non-empty row the output row is allocated with the same number of
     * entries and the {@code wsigmoid} result of every stored entry is appended at its column.
     * Dense W: every cell with a non-zero weight gets its {@code wsigmoid} result written to
     * the same offset of the dense output; zero-weight cells are left untouched.
     *
     * @param mW weights W (dense or sparse)
     * @param mU factor U
     * @param mV factor V
     * @param ret output; sparse block is used if W is sparse, dense block otherwise
     * @param wt sigmoid variant (MINUS/LOG flags are derived from it)
     * @param rl first row (inclusive)
     * @param ru last row (exclusive)
     */
    private static void matrixMultWSigmoidGeneric(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedSigmoid.WSigmoidType wt, int rl, int ru) {
        final int n = mW.clen;
        final int cd = mU.clen;
        final boolean flagminus = wt == WeightedSigmoid.WSigmoidType.MINUS || wt == WeightedSigmoid.WSigmoidType.LOG_MINUS;
        final boolean flaglog = wt == WeightedSigmoid.WSigmoidType.LOG || wt == WeightedSigmoid.WSigmoidType.LOG_MINUS;

        if (!mW.sparse) {
            final DenseBlock w = mW.getDenseBlock();
            final DenseBlock c = ret.getDenseBlock();
            for (int i = rl; i < ru; ++i) {
                final double[] wRow = w.values(i);
                final double[] cRow = c.values(i);
                final int off = w.pos(i);
                for (int j = 0; j < n; ++j) {
                    final double wij = wRow[off + j];
                    if (wij != 0.0) {
                        cRow[off + j] = LibMatrixMult.wsigmoid(wij, mU, mV, i, j, flagminus, flaglog, cd);
                    }
                }
            }
            return;
        }

        final SparseBlock w = mW.sparseBlock;
        final SparseBlock c = ret.sparseBlock;
        for (int i = rl; i < ru; ++i) {
            if (!w.isEmpty(i)) {
                final int first = w.pos(i);
                final int count = w.size(i);
                final int[] cols = w.indexes(i);
                final double[] weights = w.values(i);
                c.allocate(i, count);
                final int last = first + count;
                for (int k = first; k < last; ++k) {
                    final double out = LibMatrixMult.wsigmoid(weights[k], mU, mV, i, cols[k], flagminus, flaglog, cd);
                    c.append(i, cols[k], out);
                }
            }
        }
    }

    /**
     * Weighted divide matrix multiply over dense inputs for the cell range
     * {@code [rl, ru) x [cl, cu)}.
     *
     * <p>Only cells with a non-zero weight are processed. Depending on the flags of {@code wt}:
     * <ul>
     *   <li>basic: the output cell is set to {@code w * dot(U[i,:], V[j,:])}.</li>
     *   <li>four inputs: {@code wdivmm} is called with the weight and either the scalar read
     *       from cell (0,0) of {@code mX} (scalar variant) or the matching cell of X.</li>
     *   <li>otherwise: {@code wdivmm} is called with the mult/minus flags.</li>
     * </ul>
     * The output row handed to the kernels is row {@code i} for basic and non-left variants and
     * row {@code j} for left variants. Cells are visited in 16x16 tiles.
     *
     * @param mW dense weights W
     * @param mU dense factor U
     * @param mV dense factor V (indexed by column of W)
     * @param mX optional fourth input; may be {@code null} when {@code wt} does not use it
     * @param ret dense output, updated in place
     * @param wt operation variant
     * @param rl first row (inclusive)
     * @param ru last row (exclusive)
     * @param cl first column (inclusive)
     * @param cu last column (exclusive)
     */
    private static void matrixMultWDivMMDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WeightedDivMM.WDivMMType wt, int rl, int ru, int cl, int cu) {
        final boolean basic = wt.isBasic();
        final boolean left = wt.isLeft();
        final boolean mult = wt.isMult();
        final boolean minus = wt.isMinus();
        final boolean four = wt.hasFourInputs();
        final boolean scalar = wt.hasScalar();
        final double eps = scalar ? mX.quickGetValue(0, 0) : 0.0;
        final int cd = mU.clen;
        final DenseBlock w = mW.getDenseBlock();
        final DenseBlock u = mU.getDenseBlock();
        final DenseBlock v = mV.getDenseBlock();
        final DenseBlock x = (mX == null) ? null : mX.getDenseBlock();
        final DenseBlock c = ret.getDenseBlock();
        final int tile = 16;
        // basic and right-hand variants write into output row i, left variants into row j
        final boolean outputRowIsI = basic || !left;

        for (int rowStart = rl; rowStart < ru; rowStart += tile) {
            final int rowEnd = Math.min(ru, rowStart + tile);
            for (int colStart = cl; colStart < cu; colStart += tile) {
                final int colEnd = Math.min(cu, colStart + tile);
                for (int i = rowStart; i < rowEnd; ++i) {
                    final double[] wRow = w.values(i);
                    final double[] uRow = u.values(i);
                    final double[] xRow = four ? x.values(i) : null;
                    final int wOff = w.pos(i);
                    final int uOff = u.pos(i);
                    for (int j = colStart; j < colEnd; ++j) {
                        final double wij = wRow[wOff + j];
                        if (wij != 0.0) {
                            final double[] cRow = c.values(outputRowIsI ? i : j);
                            if (basic) {
                                cRow[wOff + j] = wij * LibMatrixMult.dotProduct(uRow, v.values(j), uOff, v.pos(j), cd);
                            } else if (four) {
                                final double fourth = scalar ? eps : xRow[wOff + j];
                                LibMatrixMult.wdivmm(wij, fourth, uRow, v.values(j), cRow, uOff, v.pos(j), left, scalar, cd);
                            } else {
                                LibMatrixMult.wdivmm(wij, uRow, v.values(j), cRow, uOff, v.pos(j), left, mult, minus, cd);
                            }
                        }
                    }
                }
            }
        }
    }

    /**
     * Weighted divide matrix-multiply kernel for a sparse weight matrix {@code mW} and dense
     * factors {@code mU} and {@code mV}.
     *
     * <p>Only the non-zeros of W inside rows {@code [rl, ru)} and columns {@code [cl, cu)} are
     * visited. The traversal is tiled over rows and columns; {@code curk} stores, for every row
     * of the current row tile, the offset of the next unvisited non-zero so that each column tile
     * resumes exactly where the previous one stopped.
     *
     * <p>Depending on {@code wt}: the basic variant appends {@code W[i][j] * (U[i,:] . V[j,:])} to
     * {@code ret}; the four-input variants hand either the scalar {@code mX(0, 0)} or the matching
     * entry of {@code mX} to {@code wdivmm}; all remaining variants call the three-input
     * {@code wdivmm}. For left variants the output row handed to {@code wdivmm} is selected by the
     * column index of the non-zero, otherwise by its row index.
     *
     * @param mW  weights, read through {@code sparseBlock}
     * @param mU  left factor, read through its dense block
     * @param mV  right factor, read through its dense block
     * @param mX  fourth input; its cell (0, 0) is read when {@code wt.hasScalar()}, otherwise its
     *            sparse block is used; may be {@code null}
     * @param ret output block
     * @param wt  operation variant
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     * @param cl  first column (inclusive)
     * @param cu  last column (exclusive)
     */
    private static void matrixMultWDivMMSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WeightedDivMM.WDivMMType wt, int rl, int ru, int cl, int cu) {
        boolean basic = wt.isBasic();
        boolean left = wt.isLeft();
        boolean mult = wt.isMult();
        boolean minus = wt.isMinus();
        boolean four = wt.hasFourInputs();
        boolean scalar = wt.hasScalar();
        double eps = scalar ? mX.quickGetValue(0, 0) : 0.0;
        int cd = mU.clen;
        SparseBlock w = mW.sparseBlock;
        DenseBlock u = mU.getDenseBlock();
        DenseBlock v = mV.getDenseBlock();
        DenseBlock c = ret.getDenseBlock();
        SparseBlock x = mX == null ? null : mX.sparseBlock;
        // Row tile: 8 * cells / nnz. NOTE(review): presumably chosen for reuse of the U/V rows - confirm.
        int blocksizeI = (int)(8L * (long)mW.rlen * (long)mW.clen / mW.nonZeros);
        // Left variants use a column tile capped by 262144 / (mU.clen * 8), but never below 8.
        int blocksizeJ = left ? Math.max(8, Math.min(262144 / (mU.clen * 8), blocksizeI)) : blocksizeI;
        int[] curk = new int[blocksizeI];
        boolean[] aligned = four && !scalar ? new boolean[blocksizeI] : null;
        for (int bi = rl; bi < ru; bi += blocksizeI) {
            int bimin = Math.min(ru, bi + blocksizeI);
            // Starting offset of every row in this tile: first non-zero at or after column cl.
            for (int i = bi; i < bimin; ++i) {
                int first = cl == 0 || w.isEmpty(i) ? 0 : w.posFIndexGTE(i, cl);
                curk[i - bi] = first >= 0 ? first : mW.clen;
            }
            // NOTE(review): this array is filled but never read below, and the row passed to
            // isAligned is the tile-relative index (i - bi) rather than i. Kept unchanged;
            // confirm whether it should be removed or used instead of the per-row check.
            if (four && !scalar) {
                for (int i = bi; i < bimin; ++i) {
                    aligned[i - bi] = w.isAligned(i - bi, x);
                }
            }
            for (int bj = cl; bj < cu; bj += blocksizeJ) {
                int bjmin = Math.min(cu, bj + blocksizeJ);
                for (int i = bi; i < bimin; ++i) {
                    if (w.isEmpty(i)) {
                        continue;
                    }
                    int wpos = w.pos(i);
                    int wlen = w.size(i);
                    int[] wix = w.indexes(i);
                    double[] wval = w.values(i);
                    double[] uvals = u.values(i);
                    int uix = u.pos(i);
                    // The scan position has to be set for every variant (not only the basic one):
                    // all branches below read k and the updated value is stored back into curk.
                    int k = wpos + curk[i - bi];
                    if (basic) {
                        for (; k < wpos + wlen && wix[k] < bjmin; ++k) {
                            ret.appendValue(i, wix[k], wval[k] * LibMatrixMult.dotProduct(uvals, v.values(wix[k]), uix, v.pos(wix[k]), cd));
                        }
                    } else if (four) {
                        if (!scalar && w.isAligned(i, x)) {
                            // Aligned rows: the X value sits at the same offset as the W value.
                            double[] xvals = x.values(i);
                            for (; k < wpos + wlen && wix[k] < bjmin; ++k) {
                                double[] cvals = c.values(left ? wix[k] : i);
                                LibMatrixMult.wdivmm(wval[k], xvals[k], uvals, v.values(wix[k]), cvals, uix, v.pos(wix[k]), left, scalar, cd);
                            }
                        } else {
                            for (; k < wpos + wlen && wix[k] < bjmin; ++k) {
                                double[] cvals = c.values(left ? wix[k] : i);
                                double xij = scalar ? eps : x.get(i, wix[k]);
                                LibMatrixMult.wdivmm(wval[k], xij, uvals, v.values(wix[k]), cvals, uix, v.pos(wix[k]), left, scalar, cd);
                            }
                        }
                    } else {
                        for (; k < wpos + wlen && wix[k] < bjmin; ++k) {
                            double[] cvals = c.values(left ? wix[k] : i);
                            LibMatrixMult.wdivmm(wval[k], uvals, v.values(wix[k]), cvals, uix, v.pos(wix[k]), left, mult, minus, cd);
                        }
                    }
                    // Remember where the next column tile continues in this row.
                    curk[i - bi] = k - wpos;
                }
            }
        }
    }

    /**
     * Weighted divide matrix-multiply fallback kernel that reads U and V through
     * {@code dotProductGeneric} and the {@code MatrixBlock} overloads of {@code wdivmm}.
     *
     * <p>Visits the non-zeros of W inside rows {@code [rl, ru)} and columns {@code [cl, cu)}.
     * The basic variant stores {@code W[i][j] * (U[i,:] . V[j,:])}, which is the same value the
     * sparse-dense kernel appends for this variant. The four-input variants pass the scalar
     * {@code mX(0, 0)} or the cell {@code mX(i, j)} to {@code wdivmm}; the remaining variants use
     * the three-input {@code wdivmm}.
     *
     * @param mW  weights; sparse or dense, selected by {@code mW.sparse}
     * @param mU  left factor
     * @param mV  right factor
     * @param mX  fourth input; only read for scalar or four-input variants
     * @param ret output block
     * @param wt  operation variant
     * @param rl  first row (inclusive)
     * @param ru  last row (exclusive)
     * @param cl  first column (inclusive)
     * @param cu  last column (exclusive)
     */
    private static void matrixMultWDivMMGeneric(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WeightedDivMM.WDivMMType wt, int rl, int ru, int cl, int cu) {
        boolean basic = wt.isBasic();
        boolean left = wt.isLeft();
        boolean mult = wt.isMult();
        boolean minus = wt.isMinus();
        boolean four = wt.hasFourInputs();
        boolean scalar = wt.hasScalar();
        double eps = scalar ? mX.quickGetValue(0, 0) : 0.0;
        int cd = mU.clen;
        DenseBlock c = ret.getDenseBlock();
        if (mW.sparse) {
            SparseBlock w = mW.sparseBlock;
            for (int i = rl; i < ru; ++i) {
                if (w.isEmpty(i)) {
                    continue;
                }
                int wpos = w.pos(i);
                int wlen = w.size(i);
                int[] wix = w.indexes(i);
                double[] wval = w.values(i);
                // First non-zero at or after column cl; a negative lookup result skips the row.
                int k = cl == 0 ? 0 : w.posFIndexGTE(i, cl);
                k = k >= 0 ? wpos + k : wpos + wlen;
                for (; k < wpos + wlen && wix[k] < cu; ++k) {
                    if (basic) {
                        // Written through appendValue, so the dense output row is not needed here.
                        double uvij = LibMatrixMult.dotProductGeneric(mU, mV, i, wix[k], cd);
                        ret.appendValue(i, wix[k], wval[k] * uvij);
                        continue;
                    }
                    // Left variants target the row given by the column index of the non-zero.
                    double[] cvals = c.values(left ? wix[k] : i);
                    if (four) {
                        double xij = scalar ? eps : mX.quickGetValue(i, wix[k]);
                        LibMatrixMult.wdivmm(wval[k], xij, mU, mV, cvals, i, wix[k], left, scalar, cd);
                    } else {
                        LibMatrixMult.wdivmm(wval[k], mU, mV, cvals, i, wix[k], left, mult, minus, cd);
                    }
                }
            }
        } else {
            DenseBlock w = mW.getDenseBlock();
            for (int i = rl; i < ru; ++i) {
                double[] wvals = w.values(i);
                int ix = w.pos(i);
                for (int j = cl; j < cu; ++j) {
                    double wij = wvals[ix + j];
                    if (wij == 0.0) {
                        continue;
                    }
                    double[] cvals = c.values(basic || !left ? i : j);
                    if (basic) {
                        // The output row is indexed with the offset of the W row, as before.
                        cvals[ix + j] = wij * LibMatrixMult.dotProductGeneric(mU, mV, i, j, cd);
                    } else if (four) {
                        double xij = scalar ? eps : mX.quickGetValue(i, j);
                        LibMatrixMult.wdivmm(wij, xij, mU, mV, cvals, i, j, left, scalar, cd);
                    } else {
                        LibMatrixMult.wdivmm(wij, mU, mV, cvals, i, j, left, mult, minus, cd);
                    }
                }
            }
        }
    }

    /**
     * Weighted cross-entropy kernel reading W, U and V through their dense blocks.
     *
     * <p>Sums {@code W[i][j] * log(U[i,:] . V[j,:] + eps)} over every non-zero cell of W in rows
     * {@code [rl, ru)} and all columns, walking the cells in 16 x 16 tiles, and stores the total
     * in {@code ret(0, 0)}. The {@code wt} argument is not consulted by this kernel.
     */
    private static void matrixMultWCeMMDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WeightedCrossEntropy.WCeMMType wt, int rl, int ru) {
        final int tile = 16;
        DenseBlock wBlock = mW.getDenseBlock();
        DenseBlock uBlock = mU.getDenseBlock();
        DenseBlock vBlock = mV.getDenseBlock();
        int numCols = mW.clen;
        int rank = mU.clen;
        double total = 0.0;
        for (int rowStart = rl; rowStart < ru; rowStart += tile) {
            int rowEnd = Math.min(ru, rowStart + tile);
            for (int colStart = 0; colStart < numCols; colStart += tile) {
                int colEnd = Math.min(numCols, colStart + tile);
                for (int r = rowStart; r < rowEnd; ++r) {
                    double[] wRow = wBlock.values(r);
                    double[] uRow = uBlock.values(r);
                    int wOff = wBlock.pos(r);
                    int uOff = uBlock.pos(r);
                    for (int col = colStart; col < colEnd; ++col) {
                        double weight = wRow[wOff + col];
                        if (weight != 0.0) {
                            double uv = LibMatrixMult.dotProduct(uRow, vBlock.values(col), uOff, vBlock.pos(col), rank);
                            total += weight * Math.log(uv + eps);
                        }
                    }
                }
            }
        }
        ret.quickSetValue(0, 0, total);
    }

    /**
     * Weighted cross-entropy kernel for sparse weights and dense-block factors.
     *
     * <p>Sums {@code W[i][j] * log(U[i,:] . V[j,:] + eps)} over the stored entries of W in rows
     * {@code [rl, ru)} and writes the total to {@code ret(0, 0)}. Rows and columns are tiled with
     * the same tile size; {@code resume} keeps, per row of the current row tile, the offset of the
     * next unvisited entry and is reset for every row tile. The {@code wt} argument is not
     * consulted by this kernel.
     */
    private static void matrixMultWCeMMSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WeightedCrossEntropy.WCeMMType wt, int rl, int ru) {
        SparseBlock wBlock = mW.sparseBlock;
        DenseBlock uBlock = mU.getDenseBlock();
        DenseBlock vBlock = mV.getDenseBlock();
        int numCols = mW.clen;
        int rank = mU.clen;
        double total = 0.0;
        int tile = (int)(8L * (long)mW.rlen * (long)mW.clen / mW.nonZeros);
        int[] resume = new int[tile];
        for (int rowStart = rl; rowStart < ru; rowStart += tile) {
            int rowEnd = Math.min(ru, rowStart + tile);
            Arrays.fill(resume, 0);
            for (int colStart = 0; colStart < numCols; colStart += tile) {
                int colEnd = Math.min(numCols, colStart + tile);
                for (int r = rowStart; r < rowEnd; ++r) {
                    if (wBlock.isEmpty(r)) {
                        continue;
                    }
                    int base = wBlock.pos(r);
                    int count = wBlock.size(r);
                    int[] cols = wBlock.indexes(r);
                    double[] weights = wBlock.values(r);
                    double[] uRow = uBlock.values(r);
                    int uOff = uBlock.pos(r);
                    int p = base + resume[r - rowStart];
                    while (p < base + count && cols[p] < colEnd) {
                        double uv = LibMatrixMult.dotProduct(uRow, vBlock.values(cols[p]), uOff, vBlock.pos(cols[p]), rank);
                        total += weights[p] * Math.log(uv + eps);
                        ++p;
                    }
                    resume[r - rowStart] = p - base;
                }
            }
        }
        ret.quickSetValue(0, 0, total);
    }

    /**
     * Weighted cross-entropy fallback kernel that reads U and V through {@code dotProductGeneric}.
     *
     * <p>Sums {@code W[i][j] * log(U[i,:] . V[j,:] + eps)} over the entries of W in rows
     * {@code [rl, ru)}: every stored entry when W is sparse, every non-zero cell when W is dense.
     * The total is written to {@code ret(0, 0)}. The {@code wt} argument is not consulted.
     */
    private static void matrixMultWCeMMGeneric(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WeightedCrossEntropy.WCeMMType wt, int rl, int ru) {
        int numCols = mW.clen;
        int rank = mU.clen;
        double total = 0.0;
        if (!mW.sparse) {
            DenseBlock wBlock = mW.getDenseBlock();
            for (int r = rl; r < ru; ++r) {
                double[] wRow = wBlock.values(r);
                int wOff = wBlock.pos(r);
                for (int col = 0; col < numCols; ++col) {
                    double weight = wRow[wOff + col];
                    if (weight != 0.0) {
                        double uv = LibMatrixMult.dotProductGeneric(mU, mV, r, col, rank);
                        total += weight * Math.log(uv + eps);
                    }
                }
            }
        } else {
            SparseBlock wBlock = mW.sparseBlock;
            for (int r = rl; r < ru; ++r) {
                if (wBlock.isEmpty(r)) {
                    continue;
                }
                int base = wBlock.pos(r);
                int count = wBlock.size(r);
                int[] cols = wBlock.indexes(r);
                double[] weights = wBlock.values(r);
                int p = base;
                while (p < base + count) {
                    double uv = LibMatrixMult.dotProductGeneric(mU, mV, r, cols[p], rank);
                    total += weights[p] * Math.log(uv + eps);
                    ++p;
                }
            }
        }
        ret.quickSetValue(0, 0, total);
    }

    /**
     * Weighted unary matrix-multiply kernel reading W, U, V and the output through dense blocks.
     *
     * <p>For every non-zero cell of W in rows {@code [rl, ru)} the result of {@code wumm} for that
     * cell is stored at the same row and column offset of the output row; zero cells are left
     * untouched. Cells are visited in 16 x 16 tiles. {@code wt} only selects whether the multiply
     * flag passed to {@code wumm} is set.
     */
    private static void matrixMultWuMMDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedUnaryMM.WUMMType wt, ValueFunction fn, int rl, int ru) {
        final int tile = 16;
        DenseBlock wBlock = mW.getDenseBlock();
        DenseBlock outBlock = ret.getDenseBlock();
        DenseBlock uBlock = mU.getDenseBlock();
        DenseBlock vBlock = mV.getDenseBlock();
        int numCols = mW.clen;
        int rank = mU.clen;
        boolean multiply = wt == WeightedUnaryMM.WUMMType.MULT;
        for (int rowStart = rl; rowStart < ru; rowStart += tile) {
            int rowEnd = Math.min(ru, rowStart + tile);
            for (int colStart = 0; colStart < numCols; colStart += tile) {
                int colEnd = Math.min(numCols, colStart + tile);
                for (int r = rowStart; r < rowEnd; ++r) {
                    double[] wRow = wBlock.values(r);
                    double[] uRow = uBlock.values(r);
                    double[] outRow = outBlock.values(r);
                    int wOff = wBlock.pos(r);
                    int uOff = uBlock.pos(r);
                    for (int col = colStart; col < colEnd; ++col) {
                        double weight = wRow[wOff + col];
                        if (weight != 0.0) {
                            // the output row is addressed with the offset of the W row
                            outRow[wOff + col] = LibMatrixMult.wumm(weight, uRow, vBlock.values(col), uOff, vBlock.pos(col), multiply, fn, rank);
                        }
                    }
                }
            }
        }
    }

    /**
     * Weighted unary matrix-multiply kernel for sparse weights and dense-block factors, writing
     * into the sparse block of {@code ret}.
     *
     * <p>For every non-empty row of W in {@code [rl, ru)} the output row is allocated with the
     * number of stored entries of that W row, and one {@code wumm} result per stored entry is
     * appended under the same column index.
     */
    private static void matrixMultWuMMSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedUnaryMM.WUMMType wt, ValueFunction fn, int rl, int ru) {
        SparseBlock wBlock = mW.sparseBlock;
        SparseBlock outBlock = ret.sparseBlock;
        DenseBlock uBlock = mU.getDenseBlock();
        DenseBlock vBlock = mV.getDenseBlock();
        int rank = mU.clen;
        boolean multiply = wt == WeightedUnaryMM.WUMMType.MULT;
        for (int r = rl; r < ru; ++r) {
            if (wBlock.isEmpty(r)) {
                continue;
            }
            int base = wBlock.pos(r);
            int count = wBlock.size(r);
            int[] cols = wBlock.indexes(r);
            double[] weights = wBlock.values(r);
            double[] uRow = uBlock.values(r);
            int uOff = uBlock.pos(r);
            outBlock.allocate(r, count);
            int p = base;
            while (p < base + count) {
                double result = LibMatrixMult.wumm(weights[p], uRow, vBlock.values(cols[p]), uOff, vBlock.pos(cols[p]), multiply, fn, rank);
                outBlock.append(r, cols[p], result);
                ++p;
            }
        }
    }

    /**
     * Weighted unary matrix-multiply fallback kernel that passes U and V to the
     * {@code MatrixBlock} overload of {@code wumm}.
     *
     * <p>With sparse W the results are appended to the sparse block of {@code ret}, one per stored
     * entry of W; with dense W they are written into the dense block of {@code ret} for every
     * non-zero cell. Only rows {@code [rl, ru)} are processed.
     */
    private static void matrixMultWuMMGeneric(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedUnaryMM.WUMMType wt, ValueFunction fn, int rl, int ru) {
        int numCols = mW.clen;
        int rank = mU.clen;
        boolean multiply = wt == WeightedUnaryMM.WUMMType.MULT;
        if (!mW.sparse) {
            DenseBlock wBlock = mW.getDenseBlock();
            DenseBlock outBlock = ret.getDenseBlock();
            for (int r = rl; r < ru; ++r) {
                double[] wRow = wBlock.values(r);
                double[] outRow = outBlock.values(r);
                int off = wBlock.pos(r);
                for (int col = 0; col < numCols; ++col) {
                    double weight = wRow[off + col];
                    if (weight != 0.0) {
                        // the output row is addressed with the offset of the W row
                        outRow[off + col] = LibMatrixMult.wumm(weight, mU, mV, r, col, multiply, fn, rank);
                    }
                }
            }
            return;
        }
        SparseBlock wBlock = mW.sparseBlock;
        SparseBlock outBlock = ret.sparseBlock;
        for (int r = rl; r < ru; ++r) {
            if (wBlock.isEmpty(r)) {
                continue;
            }
            int base = wBlock.pos(r);
            int count = wBlock.size(r);
            int[] cols = wBlock.indexes(r);
            double[] weights = wBlock.values(r);
            outBlock.allocate(r, count);
            int p = base;
            while (p < base + count) {
                double result = LibMatrixMult.wumm(weights[p], mU, mV, r, cols[p], multiply, fn, rank);
                outBlock.append(r, cols[p], result);
                ++p;
            }
        }
    }

    /**
     * Returns the dot product of the first {@code len} elements of {@code a} and {@code b}.
     *
     * <p>The leading {@code len % 8} elements are handled one at a time; the rest is processed in
     * groups of eight products that are summed before being added to the running total.
     */
    private static double dotProduct(double[] a, double[] b, int len) {
        final int rest = len % 8;
        double sum = 0.0;
        int p = 0;
        while (p < rest) {
            sum += a[p] * b[p];
            ++p;
        }
        p = rest;
        while (p < len) {
            sum += a[p] * b[p]
                + a[p + 1] * b[p + 1]
                + a[p + 2] * b[p + 2]
                + a[p + 3] * b[p + 3]
                + a[p + 4] * b[p + 4]
                + a[p + 5] * b[p + 5]
                + a[p + 6] * b[p + 6]
                + a[p + 7] * b[p + 7];
            p += 8;
        }
        return sum;
    }

    /**
     * Returns the dot product of {@code a[ai .. ai+len)} and {@code b[bi .. bi+len)}.
     *
     * <p>The leading {@code len % 8} elements are handled one at a time; the rest is processed in
     * groups of eight products that are summed before being added to the running total.
     *
     * @param a   first vector
     * @param b   second vector
     * @param ai  start offset in {@code a}
     * @param bi  start offset in {@code b}
     * @param len number of element pairs
     * @return the accumulated sum of products
     */
    public static double dotProduct(double[] a, double[] b, int ai, int bi, int len) {
        final int rest = len % 8;
        double sum = 0.0;
        for (int i = 0; i < rest; ++i, ++ai, ++bi) {
            sum += a[ai] * b[bi];
        }
        for (int i = rest; i < len; i += 8, ai += 8, bi += 8) {
            sum += a[ai] * b[bi]
                + a[ai + 1] * b[bi + 1]
                + a[ai + 2] * b[bi + 2]
                + a[ai + 3] * b[bi + 3]
                + a[ai + 4] * b[bi + 4]
                + a[ai + 5] * b[bi + 5]
                + a[ai + 6] * b[bi + 6]
                + a[ai + 7] * b[bi + 7];
        }
        return sum;
    }

    /**
     * Returns the dot product of an indexed vector with a plain array.
     *
     * <p>For each position {@code p} in {@code [ai, ai+len)} the value {@code a[p]} is multiplied
     * with {@code b[bi + aix[p]]}. The leading {@code len % 8} positions are handled one at a
     * time, the rest in groups of eight.
     *
     * @param a   values of the indexed vector
     * @param b   array addressed through {@code aix}
     * @param aix index array, read at the same positions as {@code a}
     * @param ai  start position in {@code a} and {@code aix}
     * @param bi  base offset added to every index into {@code b}
     * @param len number of positions
     * @return the accumulated sum of products
     */
    public static double dotProduct(double[] a, double[] b, int[] aix, int ai, int bi, int len) {
        final int rest = len % 8;
        final int split = ai + rest;
        final int end = ai + len;
        double sum = 0.0;
        int p = ai;
        while (p < split) {
            sum += a[p] * b[bi + aix[p]];
            ++p;
        }
        p = split;
        while (p < end) {
            sum += a[p] * b[bi + aix[p]]
                + a[p + 1] * b[bi + aix[p + 1]]
                + a[p + 2] * b[bi + aix[p + 2]]
                + a[p + 3] * b[bi + aix[p + 3]]
                + a[p + 4] * b[bi + aix[p + 4]]
                + a[p + 5] * b[bi + aix[p + 5]]
                + a[p + 6] * b[bi + aix[p + 6]]
                + a[p + 7] * b[bi + aix[p + 7]];
            p += 8;
        }
        return sum;
    }

    /**
     * Adds {@code aval * b[bi + k]} to {@code c[ci + k]} for {@code k} in {@code [0, len)}.
     *
     * <p>The leading {@code len % 8} elements are handled one at a time, the rest in groups of
     * eight.
     *
     * @param aval scalar factor
     * @param b    source array
     * @param c    array updated in place
     * @param bi   start offset in {@code b}
     * @param ci   start offset in {@code c}
     * @param len  number of elements
     */
    public static void vectMultiplyAdd(double aval, double[] b, double[] c, int bi, int ci, int len) {
        final int rest = len % 8;
        for (int j = 0; j < rest; ++j, ++bi, ++ci) {
            c[ci] += aval * b[bi];
        }
        for (int j = rest; j < len; j += 8, bi += 8, ci += 8) {
            c[ci] += aval * b[bi];
            c[ci + 1] += aval * b[bi + 1];
            c[ci + 2] += aval * b[bi + 2];
            c[ci + 3] += aval * b[bi + 3];
            c[ci + 4] += aval * b[bi + 4];
            c[ci + 5] += aval * b[bi + 5];
            c[ci + 6] += aval * b[bi + 6];
            c[ci + 7] += aval * b[bi + 7];
        }
    }

    /**
     * Adds {@code aval1 * b[bi1 + k] + aval2 * b[bi2 + k]} to {@code c[ci + k]} for {@code k} in
     * {@code [0, len)}; both scaled terms are summed before being added to the target element.
     *
     * <p>The leading {@code len % 8} elements are handled one at a time, the rest in groups of
     * eight.
     */
    private static void vectMultiplyAdd2(double aval1, double aval2, double[] b, double[] c, int bi1, int bi2, int ci, int len) {
        final int rest = len % 8;
        for (int j = 0; j < rest; ++j, ++bi1, ++bi2, ++ci) {
            c[ci] += aval1 * b[bi1] + aval2 * b[bi2];
        }
        for (int j = rest; j < len; j += 8, bi1 += 8, bi2 += 8, ci += 8) {
            c[ci] += aval1 * b[bi1] + aval2 * b[bi2];
            c[ci + 1] += aval1 * b[bi1 + 1] + aval2 * b[bi2 + 1];
            c[ci + 2] += aval1 * b[bi1 + 2] + aval2 * b[bi2 + 2];
            c[ci + 3] += aval1 * b[bi1 + 3] + aval2 * b[bi2 + 3];
            c[ci + 4] += aval1 * b[bi1 + 4] + aval2 * b[bi2 + 4];
            c[ci + 5] += aval1 * b[bi1 + 5] + aval2 * b[bi2 + 5];
            c[ci + 6] += aval1 * b[bi1 + 6] + aval2 * b[bi2 + 6];
            c[ci + 7] += aval1 * b[bi1 + 7] + aval2 * b[bi2 + 7];
        }
    }

    /**
     * Adds {@code aval1 * b[bi1 + k] + aval2 * b[bi2 + k] + aval3 * b[bi3 + k]} to
     * {@code c[ci + k]} for {@code k} in {@code [0, len)}; the three scaled terms are summed
     * before being added to the target element.
     *
     * <p>The leading {@code len % 8} elements are handled one at a time, the rest in groups of
     * eight.
     */
    private static void vectMultiplyAdd3(double aval1, double aval2, double aval3, double[] b, double[] c, int bi1, int bi2, int bi3, int ci, int len) {
        final int rest = len % 8;
        for (int j = 0; j < rest; ++j, ++bi1, ++bi2, ++bi3, ++ci) {
            c[ci] += aval1 * b[bi1] + aval2 * b[bi2] + aval3 * b[bi3];
        }
        for (int j = rest; j < len; j += 8, bi1 += 8, bi2 += 8, bi3 += 8, ci += 8) {
            c[ci] += aval1 * b[bi1] + aval2 * b[bi2] + aval3 * b[bi3];
            c[ci + 1] += aval1 * b[bi1 + 1] + aval2 * b[bi2 + 1] + aval3 * b[bi3 + 1];
            c[ci + 2] += aval1 * b[bi1 + 2] + aval2 * b[bi2 + 2] + aval3 * b[bi3 + 2];
            c[ci + 3] += aval1 * b[bi1 + 3] + aval2 * b[bi2 + 3] + aval3 * b[bi3 + 3];
            c[ci + 4] += aval1 * b[bi1 + 4] + aval2 * b[bi2 + 4] + aval3 * b[bi3 + 4];
            c[ci + 5] += aval1 * b[bi1 + 5] + aval2 * b[bi2 + 5] + aval3 * b[bi3 + 5];
            c[ci + 6] += aval1 * b[bi1 + 6] + aval2 * b[bi2 + 6] + aval3 * b[bi3 + 6];
            c[ci + 7] += aval1 * b[bi1 + 7] + aval2 * b[bi2 + 7] + aval3 * b[bi3 + 7];
        }
    }

    /**
     * Adds the sum of four scaled slices of {@code b} to {@code c[ci + k]} for {@code k} in
     * {@code [0, len)}: {@code aval1 * b[bi1 + k] + aval2 * b[bi2 + k] + aval3 * b[bi3 + k] +
     * aval4 * b[bi4 + k]}. The four terms are summed before being added to the target element.
     *
     * <p>The leading {@code len % 8} elements are handled one at a time, the rest in groups of
     * eight.
     */
    private static void vectMultiplyAdd4(double aval1, double aval2, double aval3, double aval4, double[] b, double[] c, int bi1, int bi2, int bi3, int bi4, int ci, int len) {
        final int rest = len % 8;
        for (int j = 0; j < rest; ++j, ++bi1, ++bi2, ++bi3, ++bi4, ++ci) {
            c[ci] += aval1 * b[bi1] + aval2 * b[bi2] + aval3 * b[bi3] + aval4 * b[bi4];
        }
        for (int j = rest; j < len; j += 8, bi1 += 8, bi2 += 8, bi3 += 8, bi4 += 8, ci += 8) {
            c[ci] += aval1 * b[bi1] + aval2 * b[bi2] + aval3 * b[bi3] + aval4 * b[bi4];
            c[ci + 1] += aval1 * b[bi1 + 1] + aval2 * b[bi2 + 1] + aval3 * b[bi3 + 1] + aval4 * b[bi4 + 1];
            c[ci + 2] += aval1 * b[bi1 + 2] + aval2 * b[bi2 + 2] + aval3 * b[bi3 + 2] + aval4 * b[bi4 + 2];
            c[ci + 3] += aval1 * b[bi1 + 3] + aval2 * b[bi2 + 3] + aval3 * b[bi3 + 3] + aval4 * b[bi4 + 3];
            c[ci + 4] += aval1 * b[bi1 + 4] + aval2 * b[bi2 + 4] + aval3 * b[bi3 + 4] + aval4 * b[bi4 + 4];
            c[ci + 5] += aval1 * b[bi1 + 5] + aval2 * b[bi2 + 5] + aval3 * b[bi3 + 5] + aval4 * b[bi4 + 5];
            c[ci + 6] += aval1 * b[bi1 + 6] + aval2 * b[bi2 + 6] + aval3 * b[bi3 + 6] + aval4 * b[bi4 + 6];
            c[ci + 7] += aval1 * b[bi1 + 7] + aval2 * b[bi2 + 7] + aval3 * b[bi3 + 7] + aval4 * b[bi4 + 7];
        }
    }

    /**
     * Scatter-adds a scaled indexed vector into {@code c}: for {@code j} in {@code [0, len)} the
     * value {@code aval * b[j]} is added to {@code c[ci + bix[j]]}.
     *
     * <p>The leading {@code len % 8} entries are handled one at a time, the rest in groups of
     * eight.
     */
    private static void vectMultiplyAdd(double aval, double[] b, double[] c, int[] bix, int ci, int len) {
        final int rest = len % 8;
        int j = 0;
        while (j < rest) {
            c[ci + bix[j]] += aval * b[j];
            ++j;
        }
        j = rest;
        while (j < len) {
            c[ci + bix[j]] += aval * b[j];
            c[ci + bix[j + 1]] += aval * b[j + 1];
            c[ci + bix[j + 2]] += aval * b[j + 2];
            c[ci + bix[j + 3]] += aval * b[j + 3];
            c[ci + bix[j + 4]] += aval * b[j + 4];
            c[ci + bix[j + 5]] += aval * b[j + 5];
            c[ci + bix[j + 6]] += aval * b[j + 6];
            c[ci + bix[j + 7]] += aval * b[j + 7];
            j += 8;
        }
    }

    /**
     * Scatter-adds a scaled slice of an indexed vector into {@code c}: for {@code j} in
     * {@code [bi, bi+len)} the value {@code aval * b[j]} is added to {@code c[ci + bix[j]]}.
     *
     * <p>The leading {@code len % 8} entries are handled one at a time, the rest in groups of
     * eight.
     *
     * @param aval scalar factor
     * @param b    values of the indexed vector
     * @param c    array updated in place
     * @param bix  index array, read at the same positions as {@code b}
     * @param bi   start position in {@code b} and {@code bix}
     * @param ci   base offset added to every index into {@code c}
     * @param len  number of entries
     */
    public static void vectMultiplyAdd(double aval, double[] b, double[] c, int[] bix, int bi, int ci, int len) {
        final int rest = len % 8;
        final int split = bi + rest;
        final int end = bi + len;
        int j = bi;
        while (j < split) {
            c[ci + bix[j]] += aval * b[j];
            ++j;
        }
        j = split;
        while (j < end) {
            c[ci + bix[j]] += aval * b[j];
            c[ci + bix[j + 1]] += aval * b[j + 1];
            c[ci + bix[j + 2]] += aval * b[j + 2];
            c[ci + bix[j + 3]] += aval * b[j + 3];
            c[ci + bix[j + 4]] += aval * b[j + 4];
            c[ci + bix[j + 5]] += aval * b[j + 5];
            c[ci + bix[j + 6]] += aval * b[j + 6];
            c[ci + bix[j + 7]] += aval * b[j + 7];
            j += 8;
        }
    }

    /**
     * Overwrites {@code c[ci + k]} with {@code aval * b[bi + k]} for {@code k} in
     * {@code [0, len)}.
     *
     * <p>The leading {@code len % 8} elements are handled one at a time, the rest in groups of
     * eight.
     *
     * @param aval scalar factor
     * @param b    source array
     * @param c    destination array
     * @param bi   start offset in {@code b}
     * @param ci   start offset in {@code c}
     * @param len  number of elements
     */
    public static void vectMultiplyWrite(double aval, double[] b, double[] c, int bi, int ci, int len) {
        final int rest = len % 8;
        for (int j = 0; j < rest; ++j, ++bi, ++ci) {
            c[ci] = aval * b[bi];
        }
        for (int j = rest; j < len; j += 8, bi += 8, ci += 8) {
            c[ci] = aval * b[bi];
            c[ci + 1] = aval * b[bi + 1];
            c[ci + 2] = aval * b[bi + 2];
            c[ci + 3] = aval * b[bi + 3];
            c[ci + 4] = aval * b[bi + 4];
            c[ci + 5] = aval * b[bi + 5];
            c[ci + 6] = aval * b[bi + 6];
            c[ci + 7] = aval * b[bi + 7];
        }
    }

    /**
     * Multiplies {@code c[ci + k]} by {@code aval} in place for {@code k} in {@code [0, len)}.
     *
     * <p>The leading {@code len % 8} elements are handled one at a time, the rest in groups of
     * eight.
     *
     * @param aval scalar factor
     * @param c    array updated in place
     * @param ci   start offset in {@code c}
     * @param len  number of elements
     */
    public static void vectMultiplyInPlace(double aval, double[] c, int ci, int len) {
        final int rest = len % 8;
        for (int j = 0; j < rest; ++j, ++ci) {
            c[ci] *= aval;
        }
        for (int j = rest; j < len; j += 8, ci += 8) {
            c[ci] *= aval;
            c[ci + 1] *= aval;
            c[ci + 2] *= aval;
            c[ci + 3] *= aval;
            c[ci + 4] *= aval;
            c[ci + 5] *= aval;
            c[ci + 6] *= aval;
            c[ci + 7] *= aval;
        }
    }

    /**
     * Overwrites {@code c[ci + k]} with the element-wise product {@code a[ai + k] * b[bi + k]}
     * for {@code k} in {@code [0, len)}.
     *
     * <p>The leading {@code len % 8} elements are handled one at a time, the rest in groups of
     * eight.
     *
     * @param a   first factor array
     * @param b   second factor array
     * @param c   destination array
     * @param ai  start offset in {@code a}
     * @param bi  start offset in {@code b}
     * @param ci  start offset in {@code c}
     * @param len number of elements
     */
    public static void vectMultiplyWrite(double[] a, double[] b, double[] c, int ai, int bi, int ci, int len) {
        final int rest = len % 8;
        for (int j = 0; j < rest; ++j, ++ai, ++bi, ++ci) {
            c[ci] = a[ai] * b[bi];
        }
        for (int j = rest; j < len; j += 8, ai += 8, bi += 8, ci += 8) {
            c[ci] = a[ai] * b[bi];
            c[ci + 1] = a[ai + 1] * b[bi + 1];
            c[ci + 2] = a[ai + 2] * b[bi + 2];
            c[ci + 3] = a[ai + 3] * b[bi + 3];
            c[ci + 4] = a[ai + 4] * b[bi + 4];
            c[ci + 5] = a[ai + 5] * b[bi + 5];
            c[ci + 6] = a[ai + 6] * b[bi + 6];
            c[ci + 7] = a[ai + 7] * b[bi + 7];
        }
    }

    /**
     * Multiplies an indexed vector with an array and scatters the products: for {@code j} in
     * {@code [bi, bi+len)} the element {@code c[ci + bix[j]]} is overwritten with
     * {@code a[ai + bix[j]] * b[j]}.
     *
     * <p>The leading {@code len % 8} entries are handled one at a time, the rest in groups of
     * eight.
     *
     * @param a   array addressed through {@code bix}
     * @param b   values of the indexed vector
     * @param c   destination array, addressed through {@code bix}
     * @param bix index array, read at the same positions as {@code b}
     * @param ai  base offset added to every index into {@code a}
     * @param bi  start position in {@code b} and {@code bix}
     * @param ci  base offset added to every index into {@code c}
     * @param len number of entries
     */
    public static void vectMultiplyWrite(double[] a, double[] b, double[] c, int[] bix, int ai, int bi, int ci, int len) {
        final int rest = len % 8;
        final int split = bi + rest;
        final int end = bi + len;
        int j = bi;
        while (j < split) {
            c[ci + bix[j]] = a[ai + bix[j]] * b[j];
            ++j;
        }
        j = split;
        while (j < end) {
            c[ci + bix[j]] = a[ai + bix[j]] * b[j];
            c[ci + bix[j + 1]] = a[ai + bix[j + 1]] * b[j + 1];
            c[ci + bix[j + 2]] = a[ai + bix[j + 2]] * b[j + 2];
            c[ci + bix[j + 3]] = a[ai + bix[j + 3]] * b[j + 3];
            c[ci + bix[j + 4]] = a[ai + bix[j + 4]] * b[j + 4];
            c[ci + bix[j + 5]] = a[ai + bix[j + 5]] * b[j + 5];
            c[ci + bix[j + 6]] = a[ai + bix[j + 6]] * b[j + 6];
            c[ci + bix[j + 7]] = a[ai + bix[j + 7]] * b[j + 7];
            j += 8;
        }
    }

    /**
     * Multiplies {@code c[ci + k]} by {@code a[ai + k]} in place for {@code k} in
     * {@code [0, len)}.
     *
     * <p>The leading {@code len % 8} elements are handled one at a time, the rest in groups of
     * eight.
     */
    private static void vectMultiply(double[] a, double[] c, int ai, int ci, int len) {
        final int rest = len % 8;
        for (int j = 0; j < rest; ++j, ++ai, ++ci) {
            c[ci] *= a[ai];
        }
        for (int j = rest; j < len; j += 8, ai += 8, ci += 8) {
            c[ci] *= a[ai];
            c[ci + 1] *= a[ai + 1];
            c[ci + 2] *= a[ai + 2];
            c[ci + 3] *= a[ai + 3];
            c[ci + 4] *= a[ai + 4];
            c[ci + 5] *= a[ai + 5];
            c[ci + 6] *= a[ai + 6];
            c[ci + 7] *= a[ai + 7];
        }
    }

    /**
     * Adds {@code a[ai + k] + bval} to {@code c[ci + k]} for {@code k} in {@code [0, len)}.
     *
     * <p>The leading {@code len % 8} elements are handled one at a time, the rest in groups of
     * eight. Both parts add the scalar {@code bval} to every element, so the result does not
     * depend on whether {@code len} is a multiple of eight.
     *
     * @param a    source array
     * @param bval scalar added to every source element
     * @param c    array updated in place
     * @param ai   start offset in {@code a}
     * @param ci   start offset in {@code c}
     * @param len  number of elements
     */
    public static void vectAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
        final int rest = len % 8;
        for (int j = 0; j < rest; ++j, ++ai, ++ci) {
            // the scalar belongs to the remainder elements as well
            c[ci] += a[ai] + bval;
        }
        for (int j = rest; j < len; j += 8, ai += 8, ci += 8) {
            c[ci] += a[ai] + bval;
            c[ci + 1] += a[ai + 1] + bval;
            c[ci + 2] += a[ai + 2] + bval;
            c[ci + 3] += a[ai + 3] + bval;
            c[ci + 4] += a[ai + 4] + bval;
            c[ci + 5] += a[ai + 5] + bval;
            c[ci + 6] += a[ai + 6] + bval;
            c[ci + 7] += a[ai + 7] + bval;
        }
    }

    /**
     * Adds {@code a[ai + k]} to {@code c[ci + k]} for {@code k} in {@code [0, len)}.
     *
     * <p>The leading {@code len % 8} elements are handled one at a time, the rest in groups of
     * eight.
     *
     * @param a   source array
     * @param c   array updated in place
     * @param ai  start offset in {@code a}
     * @param ci  start offset in {@code c}
     * @param len number of elements
     */
    public static void vectAdd(double[] a, double[] c, int ai, int ci, int len) {
        final int rest = len % 8;
        for (int j = 0; j < rest; ++j, ++ai, ++ci) {
            c[ci] += a[ai];
        }
        for (int j = rest; j < len; j += 8, ai += 8, ci += 8) {
            c[ci] += a[ai];
            c[ci + 1] += a[ai + 1];
            c[ci + 2] += a[ai + 2];
            c[ci + 3] += a[ai + 3];
            c[ci + 4] += a[ai + 4];
            c[ci + 5] += a[ai + 5];
            c[ci + 6] += a[ai + 6];
            c[ci + 7] += a[ai + 7];
        }
    }

    /**
     * Scatter-adds an indexed vector into {@code c}: for {@code j} in {@code [ai, ai+alen)} the
     * value {@code a[j]} is added to {@code c[ci + aix[j]]}.
     *
     * <p>The leading {@code alen % 8} entries are handled one at a time, the rest in groups of
     * eight.
     *
     * @param a    values of the indexed vector
     * @param c    array updated in place
     * @param aix  index array, read at the same positions as {@code a}
     * @param ai   start position in {@code a} and {@code aix}
     * @param ci   base offset added to every index into {@code c}
     * @param alen number of entries
     */
    public static void vectAdd(double[] a, double[] c, int[] aix, int ai, int ci, int alen) {
        final int rest = alen % 8;
        final int split = ai + rest;
        final int end = ai + alen;
        int j = ai;
        while (j < split) {
            c[ci + aix[j]] += a[j];
            ++j;
        }
        j = split;
        while (j < end) {
            c[ci + aix[j]] += a[j];
            c[ci + aix[j + 1]] += a[j + 1];
            c[ci + aix[j + 2]] += a[j + 2];
            c[ci + aix[j + 3]] += a[j + 3];
            c[ci + aix[j + 4]] += a[j + 4];
            c[ci + aix[j + 5]] += a[j + 5];
            c[ci + aix[j + 6]] += a[j + 6];
            c[ci + aix[j + 7]] += a[j + 7];
            j += 8;
        }
    }

    /**
     * Adds the element-wise sum of four arrays to {@code c}: for {@code k} in {@code [0, len)}
     * the value {@code a1[ai + k] + a2[ai + k] + a3[ai + k] + a4[ai + k]} is added to
     * {@code c[ci + k]}. All four inputs are read at the same offset, and their sum is formed
     * before it is added to the target element.
     *
     * <p>The leading {@code len % 8} elements are handled one at a time, the rest in groups of
     * eight.
     */
    private static void vectAdd4(double[] a1, double[] a2, double[] a3, double[] a4, double[] c, int ai, int ci, int len) {
        final int rest = len % 8;
        for (int j = 0; j < rest; ++j, ++ai, ++ci) {
            c[ci] += a1[ai] + a2[ai] + a3[ai] + a4[ai];
        }
        for (int j = rest; j < len; j += 8, ai += 8, ci += 8) {
            c[ci] += a1[ai] + a2[ai] + a3[ai] + a4[ai];
            c[ci + 1] += a1[ai + 1] + a2[ai + 1] + a3[ai + 1] + a4[ai + 1];
            c[ci + 2] += a1[ai + 2] + a2[ai + 2] + a3[ai + 2] + a4[ai + 2];
            c[ci + 3] += a1[ai + 3] + a2[ai + 3] + a3[ai + 3] + a4[ai + 3];
            c[ci + 4] += a1[ai + 4] + a2[ai + 4] + a3[ai + 4] + a4[ai + 4];
            c[ci + 5] += a1[ai + 5] + a2[ai + 5] + a3[ai + 5] + a4[ai + 5];
            c[ci + 6] += a1[ai + 6] + a2[ai + 6] + a3[ai + 6] + a4[ai + 6];
            c[ci + 7] += a1[ai + 7] + a2[ai + 7] + a3[ai + 7] + a4[ai + 7];
        }
    }

    /**
     * Adds every array in {@code a} to {@code c}, each read from offset {@code ai} and added at
     * offset {@code ci} over {@code len} elements.
     *
     * <p>The first {@code a.length % 4} arrays are added individually through {@code vectAdd};
     * the remaining ones are added four at a time through {@code vectAdd4}.
     */
    private static void vectAddAll(double[][] a, double[] c, int ai, int ci, int len) {
        final int single = a.length % 4;
        int idx = 0;
        while (idx < single) {
            LibMatrixMult.vectAdd(a[idx], c, ai, ci, len);
            ++idx;
        }
        idx = single;
        while (idx < a.length) {
            LibMatrixMult.vectAdd4(a[idx], a[idx + 1], a[idx + 2], a[idx + 3], c, ai, ci, len);
            idx += 4;
        }
    }

    /**
     * Adds the scalar {@code aval} to {@code c[ci + k]} in place for {@code k} in
     * {@code [0, len)}.
     *
     * <p>The leading {@code len % 8} elements are handled one at a time, the rest in groups of
     * eight.
     *
     * @param aval scalar summand
     * @param c    array updated in place
     * @param ci   start offset in {@code c}
     * @param len  number of elements
     */
    public static void vectAddInPlace(double aval, double[] c, int ci, int len) {
        final int rest = len % 8;
        final int split = ci + rest;
        final int end = ci + len;
        for (int j = ci; j < split; ++j) {
            c[j] += aval;
        }
        for (int j = split; j < end; j += 8) {
            c[j] += aval;
            c[j + 1] += aval;
            c[j + 2] += aval;
            c[j + 3] += aval;
            c[j + 4] += aval;
            c[j + 5] += aval;
            c[j + 6] += aval;
            c[j + 7] += aval;
        }
    }

    /**
     * Subtracts {@code len} values of {@code a} (from offset {@code ai}) from
     * the corresponding entries of {@code c} (from offset {@code ci}), in place.
     *
     * @param a   values to subtract
     * @param c   array updated in place
     * @param ai  start offset in {@code a}
     * @param ci  start offset in {@code c}
     * @param len number of elements to process
     */
    private static void vectSubtract(double[] a, double[] c, int ai, int ci, int len) {
        final int head = len % 8;
        int src = ai;
        int dst = ci;
        // Leading (len mod 8) elements, one by one.
        for (int done = 0; done < head; done++) {
            c[dst] -= a[src];
            dst++;
            src++;
        }
        // Remaining elements, manually unrolled in steps of eight.
        for (int done = head; done < len; done += 8) {
            c[dst] -= a[src];
            c[dst + 1] -= a[src + 1];
            c[dst + 2] -= a[src + 2];
            c[dst + 3] -= a[src + 3];
            c[dst + 4] -= a[src + 4];
            c[dst + 5] -= a[src + 5];
            c[dst + 6] -= a[src + 6];
            c[dst + 7] -= a[src + 7];
            src += 8;
            dst += 8;
        }
    }

    /**
     * Weighted sigmoid on dense arrays: takes {@code dotProduct(u, v, uix, vix, len)},
     * evaluates {@code 1 / (1 + exp(x))} with {@code x} being that value
     * ({@code flagminus}) or its negation, optionally takes the natural log,
     * and scales the result by {@code wij}.
     *
     * @return {@code wij * cval} or {@code wij * log(cval)} when {@code flaglog} is set
     */
    private static double wsigmoid(double wij, double[] u, double[] v, int uix, int vix, boolean flagminus, boolean flaglog, int len) {
        final double dot = LibMatrixMult.dotProduct(u, v, uix, vix, len);
        final double exponent = flagminus ? dot : -dot;
        double result = 1.0 / (1.0 + FastMath.exp(exponent));
        if (flaglog) {
            result = Math.log(result);
        }
        return wij * result;
    }

    /**
     * Weighted sigmoid on matrix blocks: same computation as the array
     * variant, but the inner product comes from {@code dotProductGeneric} on
     * row {@code uix} of {@code u} and row {@code vix} of {@code v}.
     *
     * @return {@code wij * cval} or {@code wij * log(cval)} when {@code flaglog} is set
     */
    private static double wsigmoid(double wij, MatrixBlock u, MatrixBlock v, int uix, int vix, boolean flagminus, boolean flaglog, int len) {
        final double dot = LibMatrixMult.dotProductGeneric(u, v, uix, vix, len);
        final double exponent = flagminus ? dot : -dot;
        double result = 1.0 / (1.0 + FastMath.exp(exponent));
        if (flaglog) {
            result = Math.log(result);
        }
        return wij * result;
    }

    /**
     * Dense weighted-divide update: derives a scalar from {@code wij} and the
     * result of {@code dotProduct(u, v, uix, vix, len)}, then hands it to
     * {@code vectMultiplyAdd} together with one factor vector and {@code c}.
     *
     * @param left  when true uses {@code u}@{@code uix} as the factor and
     *              {@code vix} as the offset into {@code c}; otherwise
     *              {@code v}@{@code vix} and {@code uix}
     * @param mult  selects {@code wij * uv} over {@code wij / uv} (only when {@code minus} is false)
     * @param minus selects {@code uv - wij}
     */
    private static void wdivmm(double wij, double[] u, double[] v, double[] c, int uix, int vix, boolean left, boolean mult, boolean minus, int len) {
        final double dot = LibMatrixMult.dotProduct(u, v, uix, vix, len);
        final double scale;
        if (minus) {
            scale = dot - wij;
        } else if (mult) {
            scale = wij * dot;
        } else {
            scale = wij / dot;
        }
        if (left) {
            LibMatrixMult.vectMultiplyAdd(scale, u, c, uix, vix, len);
        } else {
            LibMatrixMult.vectMultiplyAdd(scale, v, c, vix, uix, len);
        }
    }

    /**
     * Dense weighted-divide update with an extra value {@code xij}: the scalar
     * is {@code wij / (uv + xij)} when {@code scalar} is set and
     * {@code wij * (uv - xij)} otherwise, where {@code uv} is the result of
     * {@code dotProduct(u, v, uix, vix, len)}. The scalar is then passed to
     * {@code vectMultiplyAdd} with the factor vector chosen by {@code left}.
     */
    private static void wdivmm(double wij, double xij, double[] u, double[] v, double[] c, int uix, int vix, boolean left, boolean scalar, int len) {
        final double dot = LibMatrixMult.dotProduct(u, v, uix, vix, len);
        final double scale;
        if (scalar) {
            scale = wij / (dot + xij);
        } else {
            scale = wij * (dot - xij);
        }
        if (left) {
            LibMatrixMult.vectMultiplyAdd(scale, u, c, uix, vix, len);
        } else {
            LibMatrixMult.vectMultiplyAdd(scale, v, c, vix, uix, len);
        }
    }

    /**
     * Generic (cell-access) weighted-divide update. The scalar is derived as in
     * the dense variant, then row {@code bix} of the selected factor block is
     * scaled by it and accumulated into {@code c} at offset {@code cix * len}.
     *
     * @param left when true the factor is {@code u} (row {@code uix}) and the
     *             output row index is {@code vix}; otherwise {@code v}/{@code vix}
     *             and {@code uix}
     */
    private static void wdivmm(double wij, MatrixBlock u, MatrixBlock v, double[] c, int uix, int vix, boolean left, boolean mult, boolean minus, int len) {
        final double dot = LibMatrixMult.dotProductGeneric(u, v, uix, vix, len);
        final double scale;
        if (minus) {
            scale = dot - wij;
        } else if (mult) {
            scale = wij * dot;
        } else {
            scale = wij / dot;
        }
        final MatrixBlock factor = left ? u : v;
        final int factorRow = left ? uix : vix;
        final int outOffset = (left ? vix : uix) * len;
        for (int col = 0; col < len; col++) {
            c[outOffset + col] += factor.quickGetValue(factorRow, col) * scale;
        }
    }

    /**
     * Generic (cell-access) weighted-divide update with an extra value
     * {@code xij}: the scalar is {@code wij / (uv + xij)} when {@code scalar}
     * is set and {@code wij * (uv - xij)} otherwise. One row of the selected
     * factor block is scaled by it and accumulated into {@code c}.
     */
    private static void wdivmm(double wij, double xij, MatrixBlock u, MatrixBlock v, double[] c, int uix, int vix, boolean left, boolean scalar, int len) {
        final double dot = LibMatrixMult.dotProductGeneric(u, v, uix, vix, len);
        final double scale;
        if (scalar) {
            scale = wij / (dot + xij);
        } else {
            scale = wij * (dot - xij);
        }
        final MatrixBlock factor = left ? u : v;
        final int factorRow = left ? uix : vix;
        final int outOffset = (left ? vix : uix) * len;
        for (int col = 0; col < len; col++) {
            c[outOffset + col] += factor.quickGetValue(factorRow, col) * scale;
        }
    }

    /**
     * Weighted unary operation on dense arrays: applies {@code fn} to the
     * result of {@code dotProduct(u, v, uix, vix, len)} and combines it with
     * {@code wij}.
     *
     * @param flagmult multiply {@code wij} by the function value when true,
     *                 divide {@code wij} by it otherwise
     */
    private static double wumm(double wij, double[] u, double[] v, int uix, int vix, boolean flagmult, ValueFunction fn, int len) {
        final double dot = LibMatrixMult.dotProduct(u, v, uix, vix, len);
        final double fnValue = fn.execute(dot);
        if (flagmult) {
            return wij * fnValue;
        }
        return wij / fnValue;
    }

    /**
     * Weighted unary operation on matrix blocks: applies {@code fn} to the
     * result of {@code dotProductGeneric(u, v, uix, vix, len)} and combines it
     * with {@code wij}.
     *
     * @param flagmult multiply {@code wij} by the function value when true,
     *                 divide {@code wij} by it otherwise
     */
    private static double wumm(double wij, MatrixBlock u, MatrixBlock v, int uix, int vix, boolean flagmult, ValueFunction fn, int len) {
        final double dot = LibMatrixMult.dotProductGeneric(u, v, uix, vix, len);
        final double fnValue = fn.execute(dot);
        if (flagmult) {
            return wij * fnValue;
        }
        return wij / fnValue;
    }

    /**
     * Inner product of row {@code ai} of {@code a} and row {@code bi} of
     * {@code b} over columns {@code 0..len-1}, using per-cell access.
     *
     * @return the sum of the cell-wise products, accumulated in column order
     */
    private static double dotProductGeneric(MatrixBlock a, MatrixBlock b, int ai, int bi, int len) {
        double sum = 0.0;
        int col = 0;
        while (col < len) {
            final double lhs = a.quickGetValue(ai, col);
            final double rhs = b.quickGetValue(bi, col);
            sum += lhs * rhs;
            col++;
        }
        return sum;
    }

    /**
     * Sum of cell-wise products of {@code a} and {@code b} over all cells of
     * {@code a}, visited in row-major order using per-cell access.
     *
     * @return the accumulated sum of {@code a[i,j] * b[i,j]}
     */
    private static double dotProductGeneric(MatrixBlock a, MatrixBlock b) {
        final int numRows = a.getNumRows();
        final int numCols = a.getNumColumns();
        double sum = 0.0;
        for (int row = 0; row < numRows; row++) {
            for (int col = 0; col < numCols; col++) {
                sum += a.quickGetValue(row, col) * b.quickGetValue(row, col);
            }
        }
        return sum;
    }

    /**
     * Mirrors the upper triangle of a square dense matrix into its lower
     * triangle, working in 128x128 tiles, and counts non-zeros along the way.
     * <p>
     * Each row segment is passed to {@code LibMatrixReorg.transposeRow} with the
     * same array as source and target (presumably writing the segment into the
     * matching column; confirm against that helper). Diagonal cells count once,
     * every other non-zero cell of the upper triangle counts twice.
     *
     * @param ret square matrix whose dense values are updated in place
     * @return the non-zero count derived from the upper triangle
     * @throws RuntimeException if {@code ret} is not square
     */
    public static long copyUpperToLowerTriangle(MatrixBlock ret) {
        if (ret.rlen != ret.clen) {
            throw new RuntimeException("Invalid non-squared input matrix.");
        }
        final double[] vals = ret.getDenseBlockValues();
        final int dim = ret.rlen;
        final int tile = 128;
        long count = 0L;

        // Pass 1: tiles on the diagonal.
        for (int t0 = 0; t0 < dim; t0 += tile) {
            final int t1 = Math.min(t0 + tile, dim);
            for (int row = t0; row < t1; row++) {
                final int rowOff = row * dim;
                LibMatrixReorg.transposeRow(vals, vals, rowOff + t0, t0 * dim + row, dim, t1 - t0);
                if (vals[rowOff + row] != 0.0) {
                    count += 1L;
                }
                for (int pos = rowOff + row + 1; pos < rowOff + t1; pos++) {
                    if (vals[pos] != 0.0) {
                        count += 2L;
                    }
                }
            }
        }

        // Pass 2: tiles strictly to the right of the diagonal tile.
        for (int r0 = 0; r0 < dim; r0 += tile) {
            final int r1 = Math.min(r0 + tile, dim);
            for (int c0 = r0 + tile; c0 < dim; c0 += tile) {
                final int c1 = Math.min(c0 + tile, dim);
                for (int row = r0; row < r1; row++) {
                    final int rowOff = row * dim;
                    LibMatrixReorg.transposeRow(vals, vals, rowOff + c0, c0 * dim + row, dim, c1 - c0);
                    for (int pos = rowOff + c0; pos < rowOff + c1; pos++) {
                        if (vals[pos] != 0.0) {
                            count += 2L;
                        }
                    }
                }
            }
        }
        return count;
    }

    /**
     * Prepares the input of a transpose-self matrix multiplication.
     * <ul>
     * <li>Right transpose, sparse input with more than one row: the input is
     * reorganized with the swap-index operator into a new
     * {@code clen x rlen} block, which is returned.</li>
     * <li>Left transpose, sparse CSR input in which every row is either empty
     * or holds exactly {@code clen} entries: a dense block is created over
     * the CSR values array (via {@code DenseBlockFactory.createDenseBlock})
     * with {@code size / clen} rows and returned.</li>
     * <li>Otherwise {@code m1} itself is returned.</li>
     * </ul>
     *
     * @param m1            input matrix
     * @param leftTranspose true for the left-transpose case
     * @param par           if true, the per-row CSR check runs on a parallel stream
     * @return the prepared block, or {@code m1} when no preparation applies
     */
    public static MatrixBlock prepMatrixMultTransposeSelfInput(MatrixBlock m1, boolean leftTranspose, boolean par) {
        MatrixBlock ret = m1;
        final int rlen = m1.rlen;
        final int clen = m1.clen;
        if (!leftTranspose && m1.sparse && rlen > 1) {
            MatrixBlock tmpBlock = new MatrixBlock(clen, rlen, m1.sparse);
            LibMatrixReorg.reorg(m1, tmpBlock, new ReorgOperator(SwapIndex.getSwapIndexFnObject()));
            ret = tmpBlock;
        } else if (leftTranspose && m1.sparse && m1.sparseBlock instanceof SparseBlockCSR) {
            // The CSR block must be a final local: it is captured by the
            // predicate below and read again afterwards.
            final SparseBlockCSR sblock = (SparseBlockCSR) m1.sparseBlock;
            IntStream rowIndexes = IntStream.range(0, rlen);
            if (par) {
                rowIndexes = rowIndexes.parallel();
            }
            // Convertible only if every row is empty or fully populated.
            final boolean convertDense = rowIndexes.allMatch(i -> sblock.isEmpty(i) || sblock.size(i) == clen);
            if (convertDense) {
                int rows = (int) sblock.size() / clen;
                MatrixBlock tmpBlock = new MatrixBlock(rows, clen, false);
                tmpBlock.denseBlock = DenseBlockFactory.createDenseBlock(sblock.values(), rows, clen);
                tmpBlock.setNonZeros(m1.nonZeros);
                ret = tmpBlock;
            }
        }
        return ret;
    }

    /**
     * Decides whether the right-hand input should be prepared: both inputs
     * must be dense and {@code isSkinnyRightHandSide} (with the cache-size
     * check enabled) must hold for their dimensions.
     */
    private static boolean checkPrepMatrixMultRightInput(MatrixBlock m1, MatrixBlock m2) {
        if (m1.sparse || m2.sparse) {
            return false;
        }
        return LibMatrixMult.isSkinnyRightHandSide(m1.rlen, m1.clen, m2.rlen, m2.clen, true);
    }

    /**
     * Tests whether the right-hand side has few columns relative to the other
     * dimensions: {@code m2clen} must lie strictly between 1 and 64 and be
     * smaller than both {@code m1rlen} and {@code m2rlen}.
     *
     * @param m1clen        not used by the check
     * @param inclCacheSize when true, additionally requires
     *                      {@code 8 * m2rlen * m2clen < 262144}
     * @return true if all conditions hold
     */
    public static boolean isSkinnyRightHandSide(long m1rlen, long m1clen, long m2rlen, long m2clen, boolean inclCacheSize) {
        if (m1rlen <= m2clen || m2rlen <= m2clen) {
            return false;
        }
        if (m2clen <= 1L || m2clen >= 64L) {
            return false;
        }
        if (!inclCacheSize) {
            return true;
        }
        return 8L * m2rlen * m2clen < 262144L;
    }

    /**
     * Decides whether to parallelize over the rows of the right-hand input.
     * True in either of two cases:
     * <ol>
     * <li>{@code m1} has a single row, {@code m2} has more than one column and
     * neither input is ultra-sparse;</li>
     * <li>{@code m1} has at most 16 rows and fewer rows than {@code m2},
     * {@code m2} has more than one column, {@code m1} is not ultra-sparse,
     * {@code m2} is not sparse, and {@code k * 8 * m1.rlen * m2.clen} stays
     * below {@code 0x200000}.</li>
     * </ol>
     */
    private static boolean checkParMatrixMultRightInputRows(MatrixBlock m1, MatrixBlock m2, int k) {
        final boolean singleRowLhs = m1.rlen == 1 && m2.clen > 1
            && !m1.isUltraSparse() && !m2.isUltraSparse();
        if (singleRowLhs) {
            return true;
        }
        return m1.rlen <= 16 && m2.clen > 1 && m2.rlen > m1.rlen
            && !m1.isUltraSparse() && !m2.sparse
            && (long) k * 8L * (long) m1.rlen * (long) m2.clen < 0x200000L;
    }

    /**
     * Decides whether to parallelize over the columns of the right-hand input.
     * Requires dense inputs, {@code m2.clen > k * 1024},
     * {@code m1.rlen < k * 32}, {@code pm2r} not set, and
     * {@code 8 * m1.rlen * m1.clen < 262144} (the same bound
     * {@code isSkinnyRightHandSide} applies for its cache-size check).
     * <p>
     * The size product is evaluated in {@code long}: in {@code int} it can
     * overflow for large {@code m1.clen} and wrap to a small or negative
     * value, wrongly satisfying the bound.
     *
     * @param k    degree of parallelism
     * @param pm2r flag that disables this strategy when set
     */
    private static boolean checkParMatrixMultRightInputCols(MatrixBlock m1, MatrixBlock m2, int k, boolean pm2r) {
        if (m1.sparse || m2.sparse || pm2r) {
            return false;
        }
        final boolean wideRhs = m2.clen > k * 1024;
        final boolean shortLhs = m1.rlen < k * 32;
        // long arithmetic avoids int overflow of 8 * rlen * clen
        final boolean lhsWithinBound = 8L * (long) m1.rlen * (long) m1.clen < 262144L;
        return wideRhs && shortLhs && lhsWithinBound;
    }

    /**
     * Convenience overload: runs the single-matrix constraint check with the
     * memory check enabled and the FLOP check disabled.
     *
     * @param m1 input matrix
     * @param k  degree of parallelism
     */
    public static boolean satisfiesMultiThreadingConstraints(MatrixBlock m1, int k) {
        final boolean checkMem = true;
        final boolean checkFLOPs = false;
        final long fpFactor = -1L; // not evaluated because checkFLOPs is false
        return LibMatrixMult.satisfiesMultiThreadingConstraints(m1, checkMem, checkFLOPs, fpFactor, k);
    }

    /**
     * Checks whether a single-input operation should run multi-threaded.
     *
     * @param m1         input matrix
     * @param checkMem   when true, requires {@code 8 * m1.clen * k} to be
     *                   below {@code max(2097152, 1% of local max memory)}
     * @param checkFLOPs when true, requires {@code FPfactor * rlen * clen} to
     *                   exceed 131072 (if {@code k} equals the local
     *                   parallelism) or 2097152 (otherwise)
     * @param FPfactor   multiplier applied to the cell count for the FLOP check
     * @param k          degree of parallelism; must be greater than 1
     * @return true if every enabled constraint holds
     */
    public static boolean satisfiesMultiThreadingConstraints(MatrixBlock m1, boolean checkMem, boolean checkFLOPs, long FPfactor, int k) {
        final boolean sharedTP = InfrastructureAnalyzer.getLocalParallelism() == k;
        final double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
        if (k <= 1) {
            return false;
        }
        if (checkMem) {
            final double scaledCols = (double) (8L * (long) m1.clen * (long) k);
            if (!(scaledCols < Math.max(2097152.0, 0.01 * jvmMem))) {
                return false;
            }
        }
        if (checkFLOPs) {
            final long minWork = sharedTP ? 131072L : 0x200000L;
            if (FPfactor * (long) m1.rlen * (long) m1.clen <= minWork) {
                return false;
            }
        }
        return true;
    }

    /**
     * Checks whether a two-input matrix multiplication should run
     * multi-threaded.
     *
     * @param m1         left input
     * @param m2         right input
     * @param checkMem   when true, requires {@code 8 * m2.clen * k} to be
     *                   below {@code max(2097152, 1% of local max memory)}
     * @param checkFLOPs when true, requires
     *                   {@code FPfactor * m1.rlen * m1.clen * m2.clen}
     *                   (computed in double) to exceed 131072 (if {@code k}
     *                   equals the local parallelism) or 2097152 (otherwise)
     * @param FPfactor   multiplier applied for the FLOP check
     * @param k          degree of parallelism; must be greater than 1
     * @return true if every enabled constraint holds
     */
    public static boolean satisfiesMultiThreadingConstraints(MatrixBlock m1, MatrixBlock m2, boolean checkMem, boolean checkFLOPs, long FPfactor, int k) {
        final boolean sharedTP = InfrastructureAnalyzer.getLocalParallelism() == k;
        final double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
        if (k <= 1) {
            return false;
        }
        final boolean memOk = !checkMem
            || (double) (8L * (long) m2.clen * (long) k) < Math.max(2097152.0, 0.01 * jvmMem);
        if (!memOk) {
            return false;
        }
        if (!checkFLOPs) {
            return true;
        }
        final double work = (double) FPfactor * (double) m1.rlen * (double) m1.clen * (double) m2.clen;
        final long minWork = sharedTP ? 131072L : 0x200000L;
        return work > (double) minWork;
    }

    /**
     * Checks whether a transpose-self matrix multiplication should run
     * multi-threaded.
     * <p>
     * Requires {@code k > 1}, that the dimension selected by
     * {@code leftTranspose} ({@code clen} for left, {@code rlen} otherwise) is
     * not 1, and that {@code FPfactor * other * selected * selected} exceeds
     * 131072 (if {@code k} equals the local parallelism) or 2097152
     * (otherwise).
     * <p>
     * The work estimate is computed in {@code double}, as in the two-matrix
     * {@code satisfiesMultiThreadingConstraints} overload: the former
     * {@code long} product could overflow for very large inputs and wrap to a
     * negative value, which wrongly rejected multi-threading.
     *
     * @param m1            input matrix
     * @param leftTranspose true for the left-transpose case
     * @param FPfactor      multiplier applied to the work estimate
     * @param k             degree of parallelism
     */
    private static boolean satisfiesMultiThreadingConstraintsTSMM(MatrixBlock m1, boolean leftTranspose, long FPfactor, int k) {
        final boolean sharedTP = InfrastructureAnalyzer.getLocalParallelism() == k;
        final double threshold = sharedTP ? 131072.0 : 2097152.0;
        if (k <= 1) {
            return false;
        }
        final int squaredDim = leftTranspose ? m1.clen : m1.rlen;
        final int otherDim = leftTranspose ? m1.rlen : m1.clen;
        if (squaredDim == 1) {
            return false;
        }
        // double arithmetic cannot wrap around like the long product did
        final double work = (double) FPfactor * (double) otherDim * (double) squaredDim * (double) squaredDim;
        return work > threshold;
    }

    /**
     * Decides whether the ultra-sparse matrix multiplication should be used.
     * Never for a single-column {@code m2}; otherwise true as soon as one of
     * the conditions below holds (checked in order).
     *
     * @param m1     left input
     * @param m2     right input
     * @param m1Perm flag for the left input; combined with the sparsity of {@code m2}
     * @return true if any ultra-sparse criterion is met
     */
    public static boolean isUltraSparseMatrixMult(MatrixBlock m1, MatrixBlock m2, boolean m1Perm) {
        if (m2.clen == 1) {
            return false;
        }
        final double outSp = OptimizerUtils.getMatMultSparsity(m1.getSparsity(), m2.getSparsity(), m1.rlen, m1.clen, m2.clen, true);
        // 1) either input is ultra-sparse
        if (m1.isUltraSparse() || m2.isUltraSparse()) {
            return true;
        }
        // 2) self-multiplication with the relaxed ultra-sparse check
        if (m1.isUltraSparse(false) && m1 == m2) {
            return true;
        }
        // 3) m1Perm set and m2 not fully dense
        if (m1Perm && OptimizerUtils.getSparsity(m2.rlen, m2.clen, m2.nonZeros) < 1.0) {
            return true;
        }
        // 4) relaxed ultra-sparse input and a very sparse estimated output
        if ((m1.isUltraSparse(false) || m2.isUltraSparse(false)) && outSp < 4.0E-4) {
            return true;
        }
        // 5) very sparse m1 with few non-zeros and a comparatively large output
        return m1.getSparsity() < 4.0E-4
            && m1.getNonZeros() < 40L
            && m1.getLength() + m2.getLength() < (long) m1.rlen * (long) m2.clen
            && outSp < 0.4;
    }

    /**
     * Decides whether the product of two matrices should be produced in sparse
     * format. Requires both inputs to be sparse, {@code m1} to have more than
     * one row, {@code m2} to have more than one and fewer than 4096 columns,
     * and {@code MatrixBlock.evalSparseFormatInMemory} to favor sparse for the
     * estimated number of output non-zeros.
     */
    public static boolean isSparseOutputMatrixMult(MatrixBlock m1, MatrixBlock m2) {
        final boolean bothSparse = m1.sparse && m2.sparse;
        if (!bothSparse || m1.rlen <= 1 || m2.clen <= 1) {
            return false;
        }
        final double estSparsity = OptimizerUtils.getMatMultSparsity(m1.getSparsity(), m2.getSparsity(), m1.rlen, m1.clen, m2.clen, false);
        final long estNonZeros = (long) (estSparsity * (double) m1.rlen * (double) m2.clen);
        final boolean sparseOut = MatrixBlock.evalSparseFormatInMemory(m1.rlen, m2.clen, estNonZeros);
        if (m2.clen >= 4096) {
            return false;
        }
        return sparseOut;
    }

    /**
     * Tests for the vector shape selected by {@code left}: a single row with
     * more than one column when {@code left} is true, a single column with
     * more than one row otherwise.
     *
     * @param rlen number of rows
     * @param clen number of columns
     * @param left selects which vector shape is tested
     */
    public static boolean isOuterProductTSMM(int rlen, int clen, boolean left) {
        if (left) {
            return rlen == 1 && clen > 1;
        }
        return rlen > 1 && clen == 1;
    }

    /**
     * Prepares the right-hand input: when {@code checkPrepMatrixMultRightInput}
     * holds, {@code m2} is reorganized with the swap-index operator into a new
     * {@code m2.clen x m2.rlen} block; otherwise {@code m2} is returned as is.
     */
    private static MatrixBlock prepMatrixMultRightInput(MatrixBlock m1, MatrixBlock m2) {
        if (!LibMatrixMult.checkPrepMatrixMultRightInput(m1, m2)) {
            return m2;
        }
        final MatrixBlock swapped = new MatrixBlock(m2.clen, m2.rlen, m2.sparse);
        LibMatrixReorg.reorg(m2, swapped, new ReorgOperator(SwapIndex.getSwapIndexFnObject()));
        return swapped;
    }

    /**
     * Gathers the non-zero values among {@code bklen} consecutive entries of
     * {@code a} (starting at {@code aixi}) into {@code tmpa}, and stores for
     * each the index {@code bixk + k * n} in {@code tmpbi}, where {@code k} is
     * the position of the value within the scanned range.
     *
     * @return number of entries written to {@code tmpa} / {@code tmpbi}
     */
    private static int copyNonZeroElements(double[] a, int aixi, int bixk, int n, double[] tmpa, int[] tmpbi, int bklen) {
        int count = 0;
        int src = aixi;
        int target = bixk;
        for (int k = 0; k < bklen; k++, src++, target += n) {
            final double value = a[src];
            if (value != 0.0) {
                tmpa[count] = value;
                tmpbi[count] = target;
                count++;
            }
        }
        return count;
    }

    /**
     * Strided variant: scans {@code bklen} entries of {@code a} starting at
     * {@code aixi} with stride {@code n}, copies the non-zero values into
     * {@code tmpa} and records for each the running index that starts at
     * {@code bixk} and advances by {@code nx} per scanned entry.
     *
     * @return number of entries written to {@code tmpa} / {@code tmpbi}
     */
    private static int copyNonZeroElements(double[] a, int aixi, int bixk, int n, int nx, double[] tmpa, int[] tmpbi, int bklen) {
        int count = 0;
        int src = aixi;
        int target = bixk;
        for (int k = 0; k < bklen; k++) {
            final double value = a[src];
            if (value != 0.0) {
                tmpa[count] = value;
                tmpbi[count] = target;
                count++;
            }
            src += n;
            target += nx;
        }
        return count;
    }

    /**
     * Converts the sparse block of {@code ret} to CSR, but only for a
     * non-empty sparse result whose non-zero count does not exceed its row
     * count and whose sparse block is not CSR already.
     */
    private static void compactSparseOutput(MatrixBlock ret) {
        if (!ret.sparse) {
            return;
        }
        if (ret.nonZeros > (long) ret.rlen) {
            return;
        }
        if (ret.isEmpty()) {
            return;
        }
        if (ret.getSparseBlock() instanceof SparseBlockCSR) {
            return;
        }
        ret.sparseBlock = SparseBlockFactory.copySparseBlock(SparseBlock.Type.CSR, ret.sparseBlock, false);
    }

    /**
     * Resets the first {@code ru - rl} entries of {@code curk} for the rows
     * {@code [rl, ru)} of {@code sblock}: zeros for MCSR, a copy of the row
     * pointers for CSR, and {@code sblock.pos(i)} for any other block type.
     */
    private static void resetPosVect(int[] curk, SparseBlock sblock, int rl, int ru) {
        final int numRows = ru - rl;
        if (sblock instanceof SparseBlockMCSR) {
            Arrays.fill(curk, 0, numRows, 0);
            return;
        }
        if (sblock instanceof SparseBlockCSR) {
            final int[] rowPointers = ((SparseBlockCSR) sblock).rowPointers();
            System.arraycopy(rowPointers, rl, curk, 0, numRows);
            return;
        }
        // Generic fallback: query each row position individually.
        for (int offset = 0; offset < numRows; offset++) {
            curk[offset] = sblock.pos(rl + offset);
        }
    }

    /**
     * Waits for all tasks, sums their results in list order and stores the
     * total in cell (0, 0) of {@code ret}.
     *
     * @throws InterruptedException if waiting on a task is interrupted
     * @throws ExecutionException   if a task failed
     */
    private static void sumScalarResults(List<Future<Double>> tasks, MatrixBlock ret) throws InterruptedException, ExecutionException {
        double total = 0.0;
        final java.util.Iterator<Future<Double>> pending = tasks.iterator();
        while (pending.hasNext()) {
            final double partial = pending.next().get().doubleValue();
            total += partial;
        }
        ret.quickSetValue(0, 0, total);
    }

    /**
     * Adds all partial result arrays onto {@code ret}, processing the output
     * in chunks of 2048 elements. Within a chunk the partial arrays are added
     * one at a time for the first {@code k mod 4} arrays and four at a time
     * (via {@code vectAdd4}) for the rest.
     *
     * @param partret partial results, one array per contributor
     * @param ret     accumulation target
     */
    private static void sumDenseResults(double[][] partret, double[] ret) {
        final int chunk = 2048;
        final int total = ret.length;
        final int numParts = partret.length;
        final int leftover = numParts % 4;
        int start = 0;
        while (start < total) {
            final int chunkLen = Math.min(total - start, chunk);
            int part = 0;
            for (; part < leftover; part++) {
                LibMatrixMult.vectAdd(partret[part], ret, start, start, chunkLen);
            }
            for (part = leftover; part < numParts; part += 4) {
                LibMatrixMult.vectAdd4(partret[part], partret[part + 1], partret[part + 2], partret[part + 3], ret, start, start, chunkLen);
            }
            start += chunk;
        }
    }

    /**
     * Row predicate (compiler-generated lambda body): true if row {@code i} of
     * {@code sblock} is empty or holds exactly {@code clen} entries.
     */
    private static /* synthetic */ boolean lambda$prepMatrixMultTransposeSelfInput$0(SparseBlockCSR sblock, int clen, int i) {
        if (sblock.isEmpty(i)) {
            return true;
        }
        return sblock.size(i) == clen;
    }

    /**
     * Task that runs the weighted unary matrix multiplication for the row
     * range {@code [rl, ru)} and returns the value of
     * {@code recomputeNonZeros} for those rows of the output.
     */
    private static class MatrixMultWuTask
    implements Callable<Long> {
        private final MatrixBlock _mW;
        private final MatrixBlock _mU;
        private final MatrixBlock _mV;
        private final MatrixBlock _ret;
        private final WeightedUnaryMM.WUMMType _wt;
        private final ValueFunction _fn;
        private final int _rl;
        private final int _ru;

        protected MatrixMultWuTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedUnaryMM.WUMMType wt, ValueFunction fn, int rl, int ru) {
            this._mW = mW;
            this._mU = mU;
            this._mV = mV;
            this._ret = ret;
            this._wt = wt;
            this._fn = fn;
            this._rl = rl;
            this._ru = ru;
        }

        /**
         * Picks the dense, sparse-dense or generic kernel: the specialized
         * kernels need dense, non-empty factors U and V and are chosen by the
         * format of W.
         */
        @Override
        public Long call() {
            final boolean denseFactors = !this._mU.sparse && !this._mV.sparse
                && !this._mU.isEmptyBlock() && !this._mV.isEmptyBlock();
            if (denseFactors && !this._mW.sparse) {
                LibMatrixMult.matrixMultWuMMDense(this._mW, this._mU, this._mV, this._ret, this._wt, this._fn, this._rl, this._ru);
            } else if (denseFactors && this._mW.sparse) {
                LibMatrixMult.matrixMultWuMMSparseDense(this._mW, this._mU, this._mV, this._ret, this._wt, this._fn, this._rl, this._ru);
            } else {
                LibMatrixMult.matrixMultWuMMGeneric(this._mW, this._mU, this._mV, this._ret, this._wt, this._fn, this._rl, this._ru);
            }
            final int lastRow = this._ru - 1;
            final int lastCol = this._ret.getNumColumns() - 1;
            return this._ret.recomputeNonZeros(this._rl, lastRow, 0, lastCol);
        }
    }

    /**
     * Task that runs the weighted cross-entropy computation for the row range
     * {@code [rl, ru)} into a private 1x1 dense block and returns its single
     * cell.
     */
    private static class MatrixMultWCeTask
    implements Callable<Double> {
        private final MatrixBlock _mW;
        private final MatrixBlock _mU;
        private final MatrixBlock _mV;
        private final double _eps;
        private final MatrixBlock _ret;
        private final WeightedCrossEntropy.WCeMMType _wt;
        private final int _rl;
        private final int _ru;

        protected MatrixMultWCeTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, WeightedCrossEntropy.WCeMMType wt, int rl, int ru) {
            this._mW = mW;
            this._mU = mU;
            this._mV = mV;
            this._eps = eps;
            this._wt = wt;
            this._rl = rl;
            this._ru = ru;
            // task-local 1x1 result holder
            this._ret = new MatrixBlock(1, 1, false);
            this._ret.allocateDenseBlock();
        }

        /**
         * Picks the dense, sparse-dense or generic kernel: the specialized
         * kernels need dense, non-empty factors U and V and are chosen by the
         * format of W.
         */
        @Override
        public Double call() {
            final boolean denseFactors = !this._mU.sparse && !this._mV.sparse
                && !this._mU.isEmptyBlock() && !this._mV.isEmptyBlock();
            if (denseFactors && !this._mW.sparse) {
                LibMatrixMult.matrixMultWCeMMDense(this._mW, this._mU, this._mV, this._eps, this._ret, this._wt, this._rl, this._ru);
            } else if (denseFactors && this._mW.sparse) {
                LibMatrixMult.matrixMultWCeMMSparseDense(this._mW, this._mU, this._mV, this._eps, this._ret, this._wt, this._rl, this._ru);
            } else {
                LibMatrixMult.matrixMultWCeMMGeneric(this._mW, this._mU, this._mV, this._eps, this._ret, this._wt, this._rl, this._ru);
            }
            return this._ret.quickGetValue(0, 0);
        }
    }

    /**
     * Task that runs the weighted divide matrix multiplication for the row
     * range {@code [rl, ru)} and column range {@code [cl, cu)}, and returns
     * the value of {@code recomputeNonZeros} for the affected output rows.
     */
    private static class MatrixMultWDivTask
    implements Callable<Long> {
        private final MatrixBlock _mW;
        private final MatrixBlock _mU;
        private final MatrixBlock _mV;
        private final MatrixBlock _mX;
        private final MatrixBlock _ret;
        private final WeightedDivMM.WDivMMType _wt;
        private final int _rl;
        private final int _ru;
        private final int _cl;
        private final int _cu;

        protected MatrixMultWDivTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WeightedDivMM.WDivMMType wt, int rl, int ru, int cl, int cu) {
            this._mW = mW;
            this._mU = mU;
            this._mV = mV;
            this._mX = mX;
            this._ret = ret;
            this._wt = wt;
            this._rl = rl;
            this._ru = ru;
            this._cl = cl;
            this._cu = cu;
        }

        /**
         * Picks the dense, sparse-dense or generic kernel. The specialized
         * kernels need dense, non-empty factors U and V; when X is present
         * and the type has no scalar, X must also share the format of W.
         */
        @Override
        public Long call() {
            final boolean scalarX = this._wt.hasScalar();
            final boolean denseFactors = !this._mU.sparse && !this._mV.sparse
                && !this._mU.isEmptyBlock() && !this._mV.isEmptyBlock();
            // X only constrains the kernel choice when present and not scalar
            final boolean formatSensitiveX = this._mX != null && !scalarX;
            final boolean denseOk = denseFactors && !this._mW.sparse
                && !(formatSensitiveX && this._mX.sparse);
            final boolean sparseDenseOk = denseFactors && this._mW.sparse
                && !(formatSensitiveX && !this._mX.sparse);
            if (denseOk) {
                LibMatrixMult.matrixMultWDivMMDense(this._mW, this._mU, this._mV, this._mX, this._ret, this._wt, this._rl, this._ru, this._cl, this._cu);
            } else if (sparseDenseOk) {
                LibMatrixMult.matrixMultWDivMMSparseDense(this._mW, this._mU, this._mV, this._mX, this._ret, this._wt, this._rl, this._ru, this._cl, this._cu);
            } else {
                LibMatrixMult.matrixMultWDivMMGeneric(this._mW, this._mU, this._mV, this._mX, this._ret, this._wt, this._rl, this._ru, this._cl, this._cu);
            }
            // For "left" types the output rows are addressed by the column range.
            final int firstRow;
            final int endRow;
            if (this._wt.isLeft()) {
                firstRow = this._cl;
                endRow = this._cu;
            } else {
                firstRow = this._rl;
                endRow = this._ru;
            }
            return this._ret.recomputeNonZeros(firstRow, endRow - 1, 0, this._ret.getNumColumns() - 1);
        }
    }

    /**
     * Task that runs the weighted sigmoid computation for the row range
     * {@code [rl, ru)} and returns the value of {@code recomputeNonZeros} for
     * those rows of the output.
     */
    private static class MatrixMultWSigmoidTask
    implements Callable<Long> {
        private final MatrixBlock _mW;
        private final MatrixBlock _mU;
        private final MatrixBlock _mV;
        private final MatrixBlock _ret;
        private final WeightedSigmoid.WSigmoidType _wt;
        private final int _rl;
        private final int _ru;

        protected MatrixMultWSigmoidTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WeightedSigmoid.WSigmoidType wt, int rl, int ru) {
            this._mW = mW;
            this._mU = mU;
            this._mV = mV;
            this._ret = ret;
            this._wt = wt;
            this._rl = rl;
            this._ru = ru;
        }

        /**
         * Picks the dense, sparse-dense or generic kernel: the specialized
         * kernels need dense, non-empty factors U and V and are chosen by the
         * format of W.
         */
        @Override
        public Long call() {
            final boolean denseFactors = !this._mU.sparse && !this._mV.sparse
                && !this._mU.isEmptyBlock() && !this._mV.isEmptyBlock();
            if (denseFactors && !this._mW.sparse) {
                LibMatrixMult.matrixMultWSigmoidDense(this._mW, this._mU, this._mV, this._ret, this._wt, this._rl, this._ru);
            } else if (denseFactors && this._mW.sparse) {
                LibMatrixMult.matrixMultWSigmoidSparseDense(this._mW, this._mU, this._mV, this._ret, this._wt, this._rl, this._ru);
            } else {
                LibMatrixMult.matrixMultWSigmoidGeneric(this._mW, this._mU, this._mV, this._ret, this._wt, this._rl, this._ru);
            }
            final int lastRow = this._ru - 1;
            final int lastCol = this._ret.getNumColumns() - 1;
            return this._ret.recomputeNonZeros(this._rl, lastRow, 0, lastCol);
        }
    }

    /**
     * Task that runs the weighted squared loss computation for the row range
     * {@code [rl, ru)} into a private 1x1 dense block and returns its single
     * cell.
     */
    private static class MatrixMultWSLossTask
    implements Callable<Double> {
        private final MatrixBlock _mX;
        private final MatrixBlock _mU;
        private final MatrixBlock _mV;
        private final MatrixBlock _mW;
        private final MatrixBlock _ret;
        private final WeightedSquaredLoss.WeightsType _wt;
        private final int _rl;
        private final int _ru;

        protected MatrixMultWSLossTask(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, WeightedSquaredLoss.WeightsType wt, int rl, int ru) {
            this._mX = mX;
            this._mU = mU;
            this._mV = mV;
            this._mW = mW;
            this._wt = wt;
            this._rl = rl;
            this._ru = ru;
            // task-local 1x1 result holder
            this._ret = new MatrixBlock(1, 1, false);
            this._ret.allocateDenseBlock();
        }

        /**
         * Picks the dense, sparse-dense or generic kernel. The specialized
         * kernels need dense factors U and V and non-empty X, U, V (and W if
         * given); W, when given, must share the format of X.
         */
        @Override
        public Double call() {
            final boolean hasW = this._mW != null;
            final boolean usable = !this._mU.sparse && !this._mV.sparse
                && !this._mX.isEmptyBlock() && !this._mU.isEmptyBlock() && !this._mV.isEmptyBlock()
                && !(hasW && this._mW.isEmptyBlock());
            final boolean allDense = usable && !this._mX.sparse && !(hasW && this._mW.sparse);
            final boolean sparseDense = usable && this._mX.sparse && !(hasW && !this._mW.sparse);
            if (allDense) {
                LibMatrixMult.matrixMultWSLossDense(this._mX, this._mU, this._mV, this._mW, this._ret, this._wt, this._rl, this._ru);
            } else if (sparseDense) {
                LibMatrixMult.matrixMultWSLossSparseDense(this._mX, this._mU, this._mV, this._mW, this._ret, this._wt, this._rl, this._ru);
            } else {
                LibMatrixMult.matrixMultWSLossGeneric(this._mX, this._mU, this._mV, this._mW, this._ret, this._wt, this._rl, this._ru);
            }
            return this._ret.quickGetValue(0, 0);
        }
    }

    /**
     * Task that runs the permutation matrix multiplication for the row range
     * {@code [rl, ru)}; results are written to the given output blocks and
     * {@code call()} returns {@code null}.
     */
    private static class MatrixMultPermuteTask
    implements Callable<Object> {
        private final MatrixBlock _pm1;
        private final MatrixBlock _m2;
        private final MatrixBlock _ret1;
        private final MatrixBlock _ret2;
        private final int _rl;
        private final int _ru;

        protected MatrixMultPermuteTask(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru) {
            this._pm1 = pm1;
            this._m2 = m2;
            this._ret1 = ret1;
            this._ret2 = ret2;
            this._rl = rl;
            this._ru = ru;
        }

        /**
         * Kernel selection: sparse right input first, then a dense input with
         * sparse output, otherwise the fully dense kernel.
         */
        @Override
        public Object call() {
            final boolean denseInput = !this._m2.sparse;
            if (denseInput && !this._ret1.sparse) {
                LibMatrixMult.matrixMultPermuteDense(this._pm1, this._m2, this._ret1, this._ret2, this._rl, this._ru);
            } else if (denseInput) {
                LibMatrixMult.matrixMultPermuteDenseSparse(this._pm1, this._m2, this._ret1, this._ret2, this._rl, this._ru);
            } else {
                LibMatrixMult.matrixMultPermuteSparse(this._pm1, this._m2, this._ret1, this._ret2, this._rl, this._ru);
            }
            return null;
        }
    }

    /**
     * Task that runs the transpose-self matrix multiplication for the range
     * {@code [rl, ru)}; the result is written to the given output block and
     * {@code call()} returns {@code null}.
     */
    private static class MatrixMultTransposeTask
    implements Callable<Object> {
        private final MatrixBlock _m1;
        private final MatrixBlock _ret;
        private final boolean _left;
        private final int _rl;
        private final int _ru;

        protected MatrixMultTransposeTask(MatrixBlock m1, MatrixBlock ret, boolean left, int rl, int ru) {
            this._m1 = m1;
            this._ret = ret;
            this._left = left;
            this._rl = rl;
            this._ru = ru;
        }

        /** Dispatches on the input format: dense kernel unless the input is sparse. */
        @Override
        public Object call() {
            if (!this._m1.sparse) {
                LibMatrixMult.matrixMultTransposeSelfDense(this._m1, this._ret, this._left, this._rl, this._ru);
                return null;
            }
            LibMatrixMult.matrixMultTransposeSelfSparse(this._m1, this._ret, this._left, this._rl, this._ru);
            return null;
        }
    }

    /**
     * Task that runs the matrix multiplication chain for the row range
     * {@code [rl, ru)} into a task-local {@code 1 x m1.clen} dense block and
     * returns that block's value array.
     */
    private static class MatrixMultChainTask
    implements Callable<double[]> {
        private final MatrixBlock _m1;
        private final MatrixBlock _m2;
        private final MatrixBlock _m3;
        private final MapMultChain.ChainType _ct;
        private final int _rl;
        private final int _ru;

        protected MatrixMultChainTask(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MapMultChain.ChainType ct, int rl, int ru) {
            this._m1 = mX;
            this._m2 = mV;
            this._m3 = mW;
            this._ct = ct;
            this._rl = rl;
            this._ru = ru;
        }

        /** Allocates the partial result and dispatches on the format of the first input. */
        @Override
        public double[] call() {
            final MatrixBlock partial = new MatrixBlock(1, this._m1.clen, false);
            partial.allocateDenseBlock();
            if (!this._m1.sparse) {
                LibMatrixMult.matrixMultChainDense(this._m1, this._m2, this._m3, partial, this._ct, this._rl, this._ru);
            } else {
                LibMatrixMult.matrixMultChainSparse(this._m1, this._m2, this._m3, partial, this._ct, this._rl, this._ru);
            }
            return partial.getDenseBlockValues();
        }
    }

    /**
     * Task that runs a matrix multiplication over the range {@code [rl, ru)}.
     * With {@code pm2c} the range addresses output columns (all rows of
     * {@code m1} are processed); otherwise it addresses rows. With
     * {@code pm2r} the task writes into its own dense output block and returns
     * that block's value array; otherwise it writes into the shared output
     * and returns the value of {@code recomputeNonZeros} for its range.
     */
    private static class MatrixMultTask
    implements Callable<Object> {
        private final MatrixBlock _m1;
        private final MatrixBlock _m2;
        private final MatrixBlock _ret;
        private final boolean _tm2;
        private final boolean _pm2r;
        private final boolean _pm2c;
        private final boolean _m1Perm;
        private final boolean _sparse;
        private final int _rl;
        private final int _ru;

        protected MatrixMultTask(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2r, boolean pm2c, boolean m1Perm, boolean sparse, int rl, int ru) {
            this._m1 = m1;
            this._m2 = m2;
            this._tm2 = tm2;
            this._pm2r = pm2r;
            this._pm2c = pm2c;
            this._m1Perm = m1Perm;
            this._sparse = sparse;
            this._rl = rl;
            this._ru = ru;
            // pm2r: each task gets a private dense output of the full result size
            if (pm2r) {
                this._ret = new MatrixBlock(ret.rlen, ret.clen, false);
            } else {
                this._ret = ret;
            }
        }

        /**
         * Resolves the row/column ranges, allocates the private output if
         * needed and dispatches to the kernel matching the formats of the
         * output and both inputs.
         */
        @Override
        public Object call() {
            final int rl;
            final int ru;
            final int cl;
            final int cu;
            if (this._pm2c) {
                rl = 0;
                ru = this._m1.rlen;
                cl = this._rl;
                cu = this._ru;
            } else {
                rl = this._rl;
                ru = this._ru;
                cl = 0;
                cu = this._ret.clen;
            }
            if (this._pm2r) {
                this._ret.allocateDenseBlock();
            }
            if (this._ret.sparse) {
                LibMatrixMult.matrixMultUltraSparse(this._m1, this._m2, this._ret, this._m1Perm, rl, ru);
            } else if (this._m1.sparse) {
                if (this._m2.sparse) {
                    LibMatrixMult.matrixMultSparseSparse(this._m1, this._m2, this._ret, this._pm2r, this._sparse, rl, ru);
                } else {
                    LibMatrixMult.matrixMultSparseDense(this._m1, this._m2, this._ret, this._pm2r, rl, ru);
                }
            } else {
                if (this._m2.sparse) {
                    LibMatrixMult.matrixMultDenseSparse(this._m1, this._m2, this._ret, this._pm2r, rl, ru);
                } else {
                    LibMatrixMult.matrixMultDenseDense(this._m1, this._m2, this._ret, this._tm2, this._pm2r, rl, ru, cl, cu);
                }
            }
            if (this._pm2r) {
                return this._ret.getDenseBlockValues();
            }
            return this._ret.recomputeNonZeros(rl, ru - 1, cl, cu - 1);
        }
    }
}

