/*
 * Decompiled with CFR 0.152.
 */
package org.apache.sysds.runtime.instructions.cp;

import java.util.ArrayList;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.api.DMLScript;
import org.apache.sysds.common.Opcodes;
import org.apache.sysds.hops.OptimizerUtils;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysds.runtime.instructions.InstructionUtils;
import org.apache.sysds.runtime.instructions.cp.CPInstruction;
import org.apache.sysds.runtime.instructions.cp.CPOperand;
import org.apache.sysds.runtime.instructions.cp.UnaryCPInstruction;
import org.apache.sysds.runtime.lineage.LineageItem;
import org.apache.sysds.runtime.lineage.LineageItemUtils;
import org.apache.sysds.runtime.matrix.data.DnnParameters;
import org.apache.sysds.runtime.matrix.data.LibMatrixDNN;
import org.apache.sysds.runtime.matrix.data.LibMatrixNative;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.util.DnnUtils;
import org.apache.sysds.utils.NativeHelper;

public class DnnCPInstruction
extends UnaryCPInstruction {
    /** Logger scoped to this class. */
    private static final Log LOG = LogFactory.getLog((String)DnnCPInstruction.class.getName());
    // NOTE(review): presumably a one-shot guard for a thread under-utilization warning; its
    // readers/writers are outside the constructors and parser below — confirm before relying on it.
    private static boolean warnedUnderUtilitization = false;
    // Additional input operands beyond input1 (which is handed to the superclass constructor).
    // Every constructor assigns all of them; operands an opcode does not take are set to null.
    private final CPOperand _in2;
    private final CPOperand _in3;
    private final CPOperand _in4;
    private final CPOperand _in5;
    private final CPOperand _in6;
    private final CPOperand _in7;
    private final CPOperand _in8;
    private final CPOperand _in9;
    private final CPOperand _in10;
    private final CPOperand _in11;
    // Additional output operands beyond the primary output held by the superclass; null for
    // single-output opcodes.
    private final CPOperand _out2;
    private final CPOperand _out3;
    private final CPOperand _out4;
    private final CPOperand _out5;
    // Scalar operand lists describing the convolution/pooling geometry. processInstruction reads
    // _input_shape[0..3] as N, C, H, W; _filter_shape[0], [2], [3] as K, R, S; and index 0/1 of
    // _stride and _padding as the height/width components. All four are null for instructions
    // built through the 19-argument constructor.
    private final ArrayList<CPOperand> _input_shape;
    private final ArrayList<CPOperand> _filter_shape;
    private final ArrayList<CPOperand> _stride;
    private final ArrayList<CPOperand> _padding;
    // Thread count forwarded to the LibMatrixDNN kernels; fixed to 0 by the 19-argument constructor.
    private final int _numThreads;
    // Stored as given by the constructor caller.
    // NOTE(review): no consumer is visible in this part of the file — confirm how it is used.
    private final double _intermediateMemoryBudget;

    /**
     * Base constructor for instructions with up to three inputs, a single output and
     * convolution/pooling geometry. The remaining operand slots ({@code _in4.._in11},
     * {@code _out2.._out5}) are set to {@code null}.
     *
     * @param in first input operand (becomes {@code input1} of the superclass)
     * @param in2 second input operand, may be {@code null}
     * @param in3 third input operand, may be {@code null}
     * @param out output operand
     * @param stride stride operand list, may be {@code null}
     * @param padding padding operand list, may be {@code null}
     * @param input_shape input shape operand list, may be {@code null}
     * @param filter_shape filter shape operand list, may be {@code null}
     * @param numThreads thread count forwarded to the DNN kernels
     * @param intermediateMemoryBudget memory budget value stored on the instruction
     * @param opcode instruction opcode
     * @param istr full instruction string
     */
    public DnnCPInstruction(
            CPOperand in, CPOperand in2, CPOperand in3, CPOperand out,
            ArrayList<CPOperand> stride, ArrayList<CPOperand> padding,
            ArrayList<CPOperand> input_shape, ArrayList<CPOperand> filter_shape,
            int numThreads, double intermediateMemoryBudget, String opcode, String istr) {
        super(CPInstruction.CPType.Dnn, null, in, out, opcode, istr);

        // Caller-supplied state.
        _in2 = in2;
        _in3 = in3;
        _input_shape = input_shape;
        _filter_shape = filter_shape;
        _stride = stride;
        _padding = padding;
        _numThreads = numThreads;
        _intermediateMemoryBudget = intermediateMemoryBudget;

        // Operand slots that only the multi-input/multi-output opcodes use.
        _in4 = null;
        _in5 = null;
        _in6 = null;
        _in7 = null;
        _in8 = null;
        _in9 = null;
        _in10 = null;
        _in11 = null;
        _out2 = null;
        _out3 = null;
        _out4 = null;
        _out5 = null;
    }

    public DnnCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode, String istr, int numThreads, double intermediateMemoryBudget) {
        this(in, in2, null, out, null, null, null, null, numThreads, intermediateMemoryBudget, opcode, istr);
        if (!(opcode.equals(Opcodes.BIAS_ADD.toString()) || opcode.equals(Opcodes.RELU_BACKWARD.toString()) || opcode.equals(Opcodes.BIAS_MULTIPLY.toString()))) {
            throw new DMLRuntimeException("Incorrect usage. Expected the opcode to be bias_add or bias_multiply or relu_backward, but found " + opcode);
        }
    }

    /**
     * Creates a single-input instruction with geometry operands; the second and third inputs are
     * passed on as {@code null}. parseInstruction uses this for {@code Opcodes.MAXPOOLING},
     * {@code Opcodes.RELU_MAXPOOLING} and {@code Opcodes.AVGPOOLING}.
     *
     * @param in input operand
     * @param out output operand
     * @param opcode instruction opcode
     * @param istr full instruction string
     * @param stride stride operand list
     * @param padding padding operand list
     * @param input_shape input shape operand list
     * @param filter_shape filter shape operand list
     * @param numThreads thread count forwarded to the DNN kernels
     * @param intermediateMemoryBudget memory budget value stored on the instruction
     */
    private DnnCPInstruction(CPOperand in, CPOperand out, String opcode, String istr, ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape, ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget) {
        this(in, null, null, out, stride, padding, input_shape, filter_shape, numThreads, intermediateMemoryBudget, opcode, istr);
    }

    /**
     * Creates a two-input instruction with geometry operands; the third input is passed on as
     * {@code null}. parseInstruction uses this for the backward pooling opcodes and for
     * {@code Opcodes.CONV2D}, {@code Opcodes.CONV2D_BACKWARD_FILTER} and
     * {@code Opcodes.CONV2D_BACKWARD_DATA}.
     *
     * @param in first input operand
     * @param in2 second input operand
     * @param out output operand
     * @param opcode instruction opcode
     * @param istr full instruction string
     * @param stride stride operand list
     * @param padding padding operand list
     * @param input_shape input shape operand list
     * @param filter_shape filter shape operand list
     * @param numThreads thread count forwarded to the DNN kernels
     * @param intermediateMemoryBudget memory budget value stored on the instruction
     */
    public DnnCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode, String istr, ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape, ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget) {
        this(in, in2, null, out, stride, padding, input_shape, filter_shape, numThreads, intermediateMemoryBudget, opcode, istr);
    }

    /**
     * Creates a three-input instruction with geometry operands. parseInstruction uses this for
     * {@code Opcodes.CONV2D_BIAS_ADD}. It only reorders the arguments for the base constructor.
     *
     * @param in first input operand
     * @param in2 second input operand
     * @param in3 third input operand
     * @param out output operand
     * @param opcode instruction opcode
     * @param istr full instruction string
     * @param stride stride operand list
     * @param padding padding operand list
     * @param input_shape input shape operand list
     * @param filter_shape filter shape operand list
     * @param numThreads thread count forwarded to the DNN kernels
     * @param intermediateMemoryBudget memory budget value stored on the instruction
     */
    public DnnCPInstruction(CPOperand in, CPOperand in2, CPOperand in3, CPOperand out, String opcode, String istr, ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape, ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget) {
        this(in, in2, in3, out, stride, padding, input_shape, filter_shape, numThreads, intermediateMemoryBudget, opcode, istr);
    }

    /**
     * Creates an instruction with up to eight inputs and five outputs and no geometry operands.
     * Inputs nine to eleven are passed on as {@code null} and the memory budget as {@code 0.0}.
     * parseInstruction uses this for {@code Opcodes.BATCH_NORM2D},
     * {@code Opcodes.BATCH_NORM2D_BACKWARD} and {@code Opcodes.LSTM}, passing {@code null} for
     * operands an opcode does not have.
     *
     * @param in1 first input operand
     * @param in2 second input operand
     * @param in3 third input operand
     * @param in4 fourth input operand
     * @param in5 fifth input operand
     * @param in6 sixth input operand
     * @param in7 seventh input operand, may be {@code null}
     * @param in8 eighth input operand, may be {@code null}
     * @param out1 primary output operand
     * @param out2 second output operand
     * @param out3 third output operand
     * @param out4 fourth output operand, may be {@code null}
     * @param out5 fifth output operand, may be {@code null}
     * @param opcode instruction opcode
     * @param str full instruction string
     * @param i not read by this constructor
     */
    public DnnCPInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand in4, CPOperand in5, CPOperand in6, CPOperand in7, CPOperand in8, CPOperand out1, CPOperand out2, CPOperand out3, CPOperand out4, CPOperand out5, String opcode, String str, int i) {
        // The int parameter is not forwarded; the delegate receives a fixed 0.0 memory budget.
        this(in1, in2, in3, in4, in5, in6, in7, in8, null, null, null, out1, out2, out3, out4, out5, opcode, str, 0.0);
    }

    /**
     * Base constructor for the multi-input/multi-output opcodes: up to eleven inputs and five
     * outputs. The geometry operand lists are set to {@code null} and the thread count to 0.
     *
     * @param in1 first input operand (becomes {@code input1} of the superclass)
     * @param in2 second input operand
     * @param in3 third input operand
     * @param in4 fourth input operand
     * @param in5 fifth input operand
     * @param in6 sixth input operand
     * @param in7 seventh input operand, may be {@code null}
     * @param in8 eighth input operand, may be {@code null}
     * @param in9 ninth input operand, may be {@code null}
     * @param in10 tenth input operand, may be {@code null}
     * @param in11 eleventh input operand, may be {@code null}
     * @param out primary output operand
     * @param out2 second output operand
     * @param out3 third output operand
     * @param out4 fourth output operand, may be {@code null}
     * @param out5 fifth output operand, may be {@code null}
     * @param opcode instruction opcode
     * @param istr full instruction string
     * @param intermediateMemoryBudget memory budget value stored on the instruction
     * @throws DMLRuntimeException declared by the signature; not thrown by this body itself
     */
    public DnnCPInstruction(
            CPOperand in1, CPOperand in2, CPOperand in3, CPOperand in4, CPOperand in5, CPOperand in6,
            CPOperand in7, CPOperand in8, CPOperand in9, CPOperand in10, CPOperand in11,
            CPOperand out, CPOperand out2, CPOperand out3, CPOperand out4, CPOperand out5,
            String opcode, String istr, double intermediateMemoryBudget) throws DMLRuntimeException {
        super(CPInstruction.CPType.Dnn, null, in1, out, opcode, istr);

        // No geometry and no explicit parallelism for these opcodes.
        _input_shape = null;
        _filter_shape = null;
        _stride = null;
        _padding = null;
        _numThreads = 0;
        _intermediateMemoryBudget = intermediateMemoryBudget;

        // Extra outputs.
        _out2 = out2;
        _out3 = out3;
        _out4 = out4;
        _out5 = out5;

        // Extra inputs.
        _in2 = in2;
        _in3 = in3;
        _in4 = in4;
        _in5 = in5;
        _in6 = in6;
        _in7 = in7;
        _in8 = in8;
        _in9 = in9;
        _in10 = in10;
        _in11 = in11;
    }

    /**
     * Parses a serialized DNN instruction string into a {@code DnnCPInstruction}.
     *
     * <p>The string is split with {@code InstructionUtils.getInstructionPartsWithValueType};
     * {@code parts[0]} is the opcode and selects one of the layouts handled below. Each layout
     * validates its field count with {@code InstructionUtils.checkNumFields} before any operand
     * is read.
     *
     * @param str full instruction string
     * @return the instruction built from {@code str}
     * @throws DMLRuntimeException if the opcode matches none of the supported layouts
     */
    public static DnnCPInstruction parseInstruction(String str) {
        final String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
        final String opcode = parts[0];

        // Forward pooling: input, 12 geometry scalars, output, threads, memory budget.
        boolean isPoolingForward = opcode.equalsIgnoreCase(Opcodes.MAXPOOLING.toString())
            || opcode.equalsIgnoreCase(Opcodes.RELU_MAXPOOLING.toString())
            || opcode.equalsIgnoreCase(Opcodes.AVGPOOLING.toString());
        if (isPoolingForward) {
            InstructionUtils.checkNumFields(parts, 16);
            CPOperand in = new CPOperand(parts[1]);
            CPOperand out = new CPOperand(parts[14]);
            ArrayList<CPOperand> stride = readOperands(parts, 2, 2);
            ArrayList<CPOperand> padding = readOperands(parts, 4, 2);
            ArrayList<CPOperand> inputShape = readOperands(parts, 6, 4);
            ArrayList<CPOperand> filterShape = readOperands(parts, 10, 4);
            int numThreads = Integer.parseInt(parts[15]);
            double budget = Double.parseDouble(parts[16]);
            return new DnnCPInstruction(in, out, opcode, str, stride, padding, inputShape, filterShape, numThreads, budget);
        }

        // Backward pooling and conv2d family: two inputs, 12 geometry scalars, output, threads, budget.
        boolean isTwoInputGeometryOp = opcode.equalsIgnoreCase(Opcodes.MAXPOOLING_BACKWARD.toString())
            || opcode.equalsIgnoreCase(Opcodes.RELU_MAXPOOLING_BACKWARD.toString())
            || opcode.equalsIgnoreCase(Opcodes.AVGPOOLING_BACKWARD.toString())
            || opcode.equalsIgnoreCase(Opcodes.CONV2D.toString())
            || opcode.equalsIgnoreCase(Opcodes.CONV2D_BACKWARD_FILTER.toString())
            || opcode.equalsIgnoreCase(Opcodes.CONV2D_BACKWARD_DATA.toString());
        if (isTwoInputGeometryOp) {
            InstructionUtils.checkNumFields(parts, 17);
            CPOperand in = new CPOperand(parts[1]);
            CPOperand in2 = new CPOperand(parts[2]);
            CPOperand out = new CPOperand(parts[15]);
            ArrayList<CPOperand> stride = readOperands(parts, 3, 2);
            ArrayList<CPOperand> padding = readOperands(parts, 5, 2);
            ArrayList<CPOperand> inputShape = readOperands(parts, 7, 4);
            ArrayList<CPOperand> filterShape = readOperands(parts, 11, 4);
            int numThreads = Integer.parseInt(parts[16]);
            double budget = Double.parseDouble(parts[17]);
            return new DnnCPInstruction(in, in2, out, opcode, str, stride, padding, inputShape, filterShape, numThreads, budget);
        }

        // Fused conv2d + bias add: three inputs, 12 geometry scalars, output, threads, budget.
        if (opcode.equalsIgnoreCase(Opcodes.CONV2D_BIAS_ADD.toString())) {
            InstructionUtils.checkNumFields(parts, 18);
            CPOperand in = new CPOperand(parts[1]);
            CPOperand in2 = new CPOperand(parts[2]);
            CPOperand in3 = new CPOperand(parts[3]);
            CPOperand out = new CPOperand(parts[16]);
            ArrayList<CPOperand> stride = readOperands(parts, 4, 2);
            ArrayList<CPOperand> padding = readOperands(parts, 6, 2);
            ArrayList<CPOperand> inputShape = readOperands(parts, 8, 4);
            ArrayList<CPOperand> filterShape = readOperands(parts, 12, 4);
            int numThreads = Integer.parseInt(parts[17]);
            double budget = Double.parseDouble(parts[18]);
            return new DnnCPInstruction(in, in2, in3, out, opcode, str, stride, padding, inputShape, filterShape, numThreads, budget);
        }

        // Bias add / bias multiply / relu backward: two inputs, output, threads, budget.
        // Note: RELU_BACKWARD is matched case-sensitively here, unlike the other opcodes.
        boolean isBiasOrRelu = opcode.equalsIgnoreCase(Opcodes.BIAS_ADD.toString())
            || opcode.equals(Opcodes.RELU_BACKWARD.toString())
            || opcode.equalsIgnoreCase(Opcodes.BIAS_MULTIPLY.toString());
        if (isBiasOrRelu) {
            InstructionUtils.checkNumFields(parts, 5);
            CPOperand in = new CPOperand(parts[1]);
            CPOperand in2 = new CPOperand(parts[2]);
            CPOperand out = new CPOperand(parts[3]);
            int numThreads = Integer.parseInt(parts[4]);
            double budget = Double.parseDouble(parts[5]);
            return new DnnCPInstruction(in, in2, out, opcode, str, numThreads, budget);
        }

        // Batch norm forward: parts[1..8] are the inputs, parts[9..13] the outputs.
        if (opcode.equalsIgnoreCase(Opcodes.BATCH_NORM2D.toString())) {
            InstructionUtils.checkNumFields(parts, 14);
            ArrayList<CPOperand> ops = readOperands(parts, 1, 13);
            return new DnnCPInstruction(
                ops.get(0), ops.get(1), ops.get(2), ops.get(3), ops.get(4), ops.get(5), ops.get(6), ops.get(7),
                ops.get(8), ops.get(9), ops.get(10), ops.get(11), ops.get(12),
                opcode, str, 0);
        }

        // Batch norm backward: parts[1..6] are the inputs, parts[7..9] the outputs.
        if (opcode.equalsIgnoreCase(Opcodes.BATCH_NORM2D_BACKWARD.toString())) {
            InstructionUtils.checkNumFields(parts, 10);
            ArrayList<CPOperand> ops = readOperands(parts, 1, 9);
            return new DnnCPInstruction(
                ops.get(0), ops.get(1), ops.get(2), ops.get(3), ops.get(4), ops.get(5), null, null,
                ops.get(6), ops.get(7), ops.get(8), null, null,
                opcode, str, 0);
        }

        // LSTM forward: parts[1..6] are the inputs, parts[7..11] the outputs.
        if (opcode.equalsIgnoreCase(Opcodes.LSTM.toString())) {
            InstructionUtils.checkNumFields(parts, 12);
            ArrayList<CPOperand> ops = readOperands(parts, 1, 11);
            return new DnnCPInstruction(
                ops.get(0), ops.get(1), ops.get(2), ops.get(3), ops.get(4), ops.get(5), null, null,
                ops.get(6), ops.get(7), ops.get(8), ops.get(9), ops.get(10),
                opcode, str, 0);
        }

        // LSTM backward: parts[1..11] are the inputs, parts[12..16] the outputs.
        if (opcode.equalsIgnoreCase(Opcodes.LSTM_BACKWARD.toString())) {
            InstructionUtils.checkNumFields(parts, 17);
            ArrayList<CPOperand> ops = readOperands(parts, 1, 16);
            return new DnnCPInstruction(
                ops.get(0), ops.get(1), ops.get(2), ops.get(3), ops.get(4), ops.get(5),
                ops.get(6), ops.get(7), ops.get(8), ops.get(9), ops.get(10),
                ops.get(11), ops.get(12), ops.get(13), ops.get(14), ops.get(15),
                opcode, str, 0.0);
        }

        throw new DMLRuntimeException("Unknown opcode while parsing a DnnCPInstruction: " + str);
    }

    /**
     * Builds operands from {@code count} consecutive instruction parts, in ascending index order.
     *
     * @param parts split instruction parts
     * @param first index of the first part to convert
     * @param count number of consecutive parts to convert
     * @return list holding one {@code CPOperand} per part, in part order
     */
    private static ArrayList<CPOperand> readOperands(String[] parts, int first, int count) {
        ArrayList<CPOperand> operands = new ArrayList<CPOperand>();
        for (int offset = 0; offset < count; offset++) {
            operands.add(new CPOperand(parts[first + offset]));
        }
        return operands;
    }

    /**
     * Reads the scalar bound to one operand of an operand list and narrows it to {@code int}.
     *
     * @param ec execution context used to resolve the scalar
     * @param aL operand list
     * @param index position of the operand within {@code aL}
     * @return the scalar's long value cast to {@code int}
     */
    private static int getScalarInput(ExecutionContext ec, ArrayList<CPOperand> aL, int index) {
        CPOperand operand = aL.get(index);
        long value = ec.getScalarInput(operand).getLongValue();
        return (int) value;
    }

    /**
     * Executes the relu-backward operator on {@code input1} and {@code _in2}.
     *
     * <p>The output has the dimensions of the first input and is flagged sparse if either input
     * is in sparse format. When either input is empty, no block is allocated and the kernel is
     * skipped.
     *
     * @param ec execution context that supplies the inputs and receives the output
     */
    public void processReluBackwardInstruction(ExecutionContext ec) {
        String inputName = input1.getName();
        MatrixBlock input = ec.getMatrixInput(inputName);
        String doutName = _in2.getName();
        MatrixBlock dout = ec.getMatrixInput(doutName);

        boolean sparseOutput = input.isInSparseFormat() || dout.isInSparseFormat();
        MatrixBlock result = new MatrixBlock(input.getNumRows(), input.getNumColumns(), sparseOutput);

        // An empty operand leaves the result as constructed; only run the kernel otherwise.
        boolean anyEmpty = input.isEmpty() || dout.isEmpty();
        if (!anyEmpty) {
            result.allocateBlock();
            LibMatrixDNN.reluBackward(input, dout, result, _numThreads);
        }

        ec.releaseMatrixInput(inputName);
        ec.releaseMatrixInput(doutName);
        ec.setMatrixOutput(getOutputVariableName(), result);
    }

    /**
     * Executes the bias-add operator on {@code input1} (data) and {@code _in2} (bias).
     *
     * <p>Result selection:
     * <ul>
     *   <li>bias and input both empty: empty sparse block of the input's dimensions;</li>
     *   <li>only the bias empty: a copy of the input;</li>
     *   <li>otherwise: a dense block filled by {@code LibMatrixDNN.biasAdd}.</li>
     * </ul>
     *
     * @param ec execution context that supplies the inputs and receives the output
     * @throws DMLRuntimeException if the bias does not have exactly one column (thrown before
     *         the inputs are released)
     */
    public void processBiasAddInstruction(ExecutionContext ec) {
        MatrixBlock input = ec.getMatrixInput(input1.getName());
        MatrixBlock bias = ec.getMatrixInput(_in2.getName());

        int biasCols = bias.getNumColumns();
        if (biasCols != 1) {
            throw new DMLRuntimeException("Expected the number of columns of bias matrix to be 1, but found " + biasCols);
        }

        MatrixBlock result;
        if (!bias.isEmpty()) {
            result = new MatrixBlock(input.getNumRows(), input.getNumColumns(), false);
            result.allocateDenseBlock();
            LibMatrixDNN.biasAdd(input, bias, result, _numThreads);
        } else if (input.isEmpty()) {
            result = new MatrixBlock(input.getNumRows(), input.getNumColumns(), true);
        } else {
            // Adding an empty bias leaves the data unchanged.
            result = new MatrixBlock(input);
        }

        ec.releaseMatrixInput(input1.getName());
        ec.releaseMatrixInput(_in2.getName());
        ec.setMatrixOutput(getOutputVariableName(), result);
    }

    /**
     * Executes the bias-multiply operator on {@code input1} (data) and {@code _in2} (bias).
     *
     * <p>An empty bias yields an empty sparse block of the input's dimensions. Otherwise a block
     * in the input's storage format is allocated and filled by {@code LibMatrixDNN.biasMultiply}.
     *
     * @param ec execution context that supplies the inputs and receives the output
     * @throws DMLRuntimeException if the bias does not have exactly one column (thrown before
     *         the inputs are released)
     */
    public void processBiasMultiplyInstruction(ExecutionContext ec) {
        MatrixBlock input = ec.getMatrixInput(input1.getName());
        MatrixBlock bias = ec.getMatrixInput(_in2.getName());

        int biasCols = bias.getNumColumns();
        if (biasCols != 1) {
            throw new DMLRuntimeException("Expected the number of columns of bias matrix to be 1, but found " + biasCols);
        }

        MatrixBlock result;
        if (!bias.isEmpty()) {
            result = new MatrixBlock(input.getNumRows(), input.getNumColumns(), input.isInSparseFormat()).allocateBlock();
            LibMatrixDNN.biasMultiply(input, bias, result, _numThreads);
        } else {
            result = new MatrixBlock(input.getNumRows(), input.getNumColumns(), true);
        }

        ec.releaseMatrixInput(input1.getName());
        ec.releaseMatrixInput(_in2.getName());
        ec.setMatrixOutput(getOutputVariableName(), result);
    }

    /**
     * Executes the 2D batch-normalization forward operator.
     *
     * <p>Inputs: {@code input1} image, {@code _in2} scale, {@code _in3} bias, {@code _in4}
     * running mean, {@code _in5} running variance (matrices); {@code _in6} phase (string scalar),
     * {@code _in7} epsilon and {@code _in8} mu (double scalars). Five dense outputs are allocated
     * up front, filled by {@code LibMatrixDNN.batchNorm2D} and bound to {@code output} and
     * {@code _out2.._out5}.
     *
     * @param ec execution context that supplies the inputs and receives the outputs
     */
    public void processBatchNorm2dInstruction(ExecutionContext ec) {
        // Matrix inputs.
        MatrixBlock image = ec.getMatrixInput(input1.getName());
        MatrixBlock scale = ec.getMatrixInput(_in2.getName());
        MatrixBlock bias = ec.getMatrixInput(_in3.getName());
        MatrixBlock runningMean = ec.getMatrixInput(_in4.getName());
        MatrixBlock runningVar = ec.getMatrixInput(_in5.getName());

        // Scalar inputs.
        String phase = ec.getScalarInput(_in6).getStringValue();
        double epsilon = ec.getScalarInput(_in7).getDoubleValue();
        double mu = ec.getScalarInput(_in8).getDoubleValue();

        // Dense outputs: one shaped like the image, two like the running mean, two like the running variance.
        int meanRows = runningMean.getNumRows();
        int meanCols = runningMean.getNumColumns();
        int varRows = runningVar.getNumRows();
        int varCols = runningVar.getNumColumns();
        MatrixBlock normalized = new MatrixBlock(image.getNumRows(), image.getNumColumns(), false).allocateBlock();
        MatrixBlock updatedMean = new MatrixBlock(meanRows, meanCols, false).allocateBlock();
        MatrixBlock updatedVar = new MatrixBlock(varRows, varCols, false).allocateBlock();
        MatrixBlock savedMean = new MatrixBlock(meanRows, meanCols, false).allocateBlock();
        MatrixBlock savedInvVar = new MatrixBlock(varRows, varCols, false).allocateBlock();

        LibMatrixDNN.batchNorm2D(image, scale, bias, runningMean, runningVar, phase, epsilon, mu,
            normalized, updatedMean, updatedVar, savedMean, savedInvVar);

        ec.releaseMatrixInput(input1.getName(), _in2.getName(), _in3.getName(), _in4.getName(), _in5.getName());

        ec.setMatrixOutput(output.getName(), normalized);
        ec.setMatrixOutput(_out2.getName(), updatedMean);
        ec.setMatrixOutput(_out3.getName(), updatedVar);
        ec.setMatrixOutput(_out4.getName(), savedMean);
        ec.setMatrixOutput(_out5.getName(), savedInvVar);
    }

    /**
     * Executes the 2D batch-normalization backward operator.
     *
     * <p>Inputs: {@code input1} image, {@code _in2} dout, {@code _in3} scale (matrices),
     * {@code _in4} epsilon (double scalar), {@code _in5} saved mean and {@code _in6} saved inverse
     * variance (matrices). Three dense outputs are allocated, filled by
     * {@code LibMatrixDNN.batchNorm2DBackward} and bound to {@code output}, {@code _out2} and
     * {@code _out3}.
     *
     * @param ec execution context that supplies the inputs and receives the outputs
     */
    public void processBatchNorm2dBackwardInstruction(ExecutionContext ec) {
        MatrixBlock image = ec.getMatrixInput(input1.getName());
        MatrixBlock dout = ec.getMatrixInput(_in2.getName());
        MatrixBlock scale = ec.getMatrixInput(_in3.getName());
        double epsilon = ec.getScalarInput(_in4).getDoubleValue();
        MatrixBlock savedMean = ec.getMatrixInput(_in5.getName());
        MatrixBlock savedInvVar = ec.getMatrixInput(_in6.getName());

        // The image gradient is shaped like the image; both parameter gradients like the scale.
        int scaleRows = scale.getNumRows();
        int scaleCols = scale.getNumColumns();
        MatrixBlock gradImage = new MatrixBlock(image.getNumRows(), image.getNumColumns(), false).allocateBlock();
        MatrixBlock gradScale = new MatrixBlock(scaleRows, scaleCols, false).allocateBlock();
        MatrixBlock gradBias = new MatrixBlock(scaleRows, scaleCols, false).allocateBlock();

        LibMatrixDNN.batchNorm2DBackward(image, dout, scale, epsilon, savedMean, savedInvVar,
            gradImage, gradScale, gradBias);

        // _in4 is a scalar and therefore not part of the release list.
        ec.releaseMatrixInput(input1.getName(), _in2.getName(), _in3.getName(), _in5.getName(), _in6.getName());

        ec.setMatrixOutput(output.getName(), gradImage);
        ec.setMatrixOutput(_out2.getName(), gradScale);
        ec.setMatrixOutput(_out3.getName(), gradBias);
    }

    /**
     * Reports whether the filter is in sparse format after densifying small sparse filters.
     *
     * <p>A sparse filter with fewer than 1e7 cells is converted to dense in place as a side
     * effect; larger sparse filters are left untouched.
     *
     * @param filter filter matrix; may be converted to dense format by this call
     * @return {@code true} if the filter is still in sparse format after the optional conversion
     */
    private static boolean isFilterSparse(MatrixBlock filter) {
        // Widen before multiplying: an int product overflows for large filters (e.g. 65536 x 65536
        // wraps to 0), which would wrongly classify a huge sparse filter as small and densify it.
        long numElems = (long)filter.getNumRows() * filter.getNumColumns();
        if (filter.isInSparseFormat() && (double)numElems < 1.0E7) {
            filter.sparseToDense();
        }
        return filter.isInSparseFormat();
    }

    /**
     * Executes the LSTM forward or backward operator.
     *
     * <p>Matrix inputs: {@code input1} X, {@code _in2} W, {@code _in3} bias, {@code _in4} out0,
     * {@code _in5} c0; {@code _in6} is the boolean {@code return_sequences} scalar. The backward
     * pass additionally reads {@code _in7} dout, {@code _in8} dc, {@code _in9} cache_out,
     * {@code _in10} cache_c and {@code _in11} cache_ifog.
     *
     * <p>Dimensions are derived from the inputs: {@code N} and {@code M} are the rows and columns
     * of out0, {@code D = rows(W) - M}, and {@code T = cols(X) / D}.
     *
     * @param ec execution context that supplies the inputs and receives the five outputs
     * @param backward {@code true} to run {@code LibMatrixDNN.lstmBackward}, {@code false} to run
     *        {@code LibMatrixDNN.lstm}
     * @throws DMLRuntimeException if W has no more rows than M, if c0 and out0 differ in shape,
     *         if W does not have 4M columns, or if bias is not (1, 4M); the inputs are not
     *         released in these cases
     */
    private void processLSTMInstruction(ExecutionContext ec, boolean backward) {
        MatrixBlock X = ec.getMatrixInput(this.input1.getName());
        MatrixBlock W = ec.getMatrixInput(this._in2.getName());
        MatrixBlock bias = ec.getMatrixInput(this._in3.getName());
        MatrixBlock out0 = ec.getMatrixInput(this._in4.getName());
        MatrixBlock c0 = ec.getMatrixInput(this._in5.getName());
        boolean return_sequences = ec.getScalarInput(this._in6).getBooleanValue();
        // Backward-only inputs stay null on the forward pass.
        MatrixBlock dout = null;
        MatrixBlock dc = null;
        MatrixBlock cache_out = null;
        MatrixBlock cache_c = null;
        MatrixBlock cache_ifog = null;
        if (backward) {
            dout = ec.getMatrixInput(this._in7.getName());
            dc = ec.getMatrixInput(this._in8.getName());
            cache_out = ec.getMatrixInput(this._in9.getName());
            cache_c = ec.getMatrixInput(this._in10.getName());
            cache_ifog = ec.getMatrixInput(this._in11.getName());
        }
        int M = out0.getNumColumns();
        int N = out0.getNumRows();
        int numRowsW = W.getNumRows();
        int numColsW = W.getNumColumns();
        int D = numRowsW - M;
        // Guard the division below: D == 0 would raise a bare ArithmeticException and D < 0 would
        // yield negative dimensions, so report it like the other dimension errors instead.
        if (D <= 0) {
            throw new DMLRuntimeException("Incorrect input dimension for LSTM. Expected Weight Matrix to be of Dimension (D+M, 4M) with D > 0 and M = " + M + ", but got (" + numRowsW + ", " + numColsW + ")");
        }
        int T = X.getNumColumns() / D;
        if (c0.getNumColumns() != out0.getNumColumns() || out0.getNumRows() != c0.getNumRows()) {
            throw new DMLRuntimeException("Incorrect input dimension for LSTM. Expected input4 and input3 Matrix to be of the same Dimension (N, M), but got (" + c0.getNumRows() + ", " + c0.getNumColumns() + ") and (" + out0.getNumRows() + ", " + out0.getNumColumns() + ")");
        }
        if (W.getNumColumns() != 4 * M) {
            throw new DMLRuntimeException("Incorrect input dimension for LSTM. Expected Weight Matrix to be of Dimension (D+M, 4M) = (" + numRowsW + ", " + 4 * M + "), but got (" + numRowsW + ", " + numColsW + ")");
        }
        if (bias.getNumColumns() != 4 * M || bias.getNumRows() != 1) {
            throw new DMLRuntimeException("Incorrect input dimension for LSTM. Expected bias Matrix to be of Dimension (1, 4M) = (1, " + 4 * M + "), but got (" + bias.getNumRows() + ", " + bias.getNumColumns() + ")");
        }
        // Output shapes depend on the direction:
        //   forward : (N, T*M or M), (N, M), (T, N*M), (T, N*M), (T, N*4M)
        //   backward: (N, T*D), (D+M, 4M), (1, 4M), (N, M), (N, M)
        MatrixBlock out1 = new MatrixBlock(N, backward ? T * D : (return_sequences ? T * M : M), false);
        MatrixBlock out2 = new MatrixBlock(backward ? D + M : N, backward ? 4 * M : M, false);
        MatrixBlock out3 = new MatrixBlock(backward ? 1 : T, backward ? 4 * M : N * M, false);
        MatrixBlock out4 = new MatrixBlock(backward ? N : T, backward ? M : N * M, false);
        MatrixBlock out5 = new MatrixBlock(backward ? N : T, backward ? M : N * 4 * M, false);
        DnnParameters params = new DnnParameters(N, D, T, M, X, W, bias, out0, c0, cache_out, cache_c, cache_ifog, return_sequences, dout, dc, out1, out2, out3, out4, out5, this._numThreads);
        if (backward) {
            LibMatrixDNN.lstmBackward(params);
        } else {
            LibMatrixDNN.lstm(params);
        }
        // _in6 is a scalar and therefore not part of the release list.
        ec.releaseMatrixInput(this.input1.getName(), this._in2.getName(), this._in3.getName(), this._in4.getName(), this._in5.getName());
        if (backward) {
            ec.releaseMatrixInput(this._in7.getName(), this._in8.getName(), this._in9.getName(), this._in10.getName(), this._in11.getName());
        }
        ec.setMatrixOutput(this.output.getName(), out1);
        ec.setMatrixOutput(this._out2.getName(), out2);
        ec.setMatrixOutput(this._out3.getName(), out3);
        ec.setMatrixOutput(this._out4.getName(), out4);
        ec.setMatrixOutput(this._out5.getName(), out5);
    }

    @Override
    public void processInstruction(ExecutionContext ec) {
        if (this.instOpcode.equalsIgnoreCase(Opcodes.BIAS_ADD.toString())) {
            this.processBiasAddInstruction(ec);
            return;
        }
        if (this.instOpcode.equalsIgnoreCase(Opcodes.BIAS_MULTIPLY.toString())) {
            this.processBiasMultiplyInstruction(ec);
            return;
        }
        if (this.instOpcode.equalsIgnoreCase(Opcodes.RELU_BACKWARD.toString())) {
            this.processReluBackwardInstruction(ec);
            return;
        }
        if (this.instOpcode.equalsIgnoreCase(Opcodes.BATCH_NORM2D.toString())) {
            this.processBatchNorm2dInstruction(ec);
            return;
        }
        if (this.instOpcode.equalsIgnoreCase(Opcodes.BATCH_NORM2D_BACKWARD.toString())) {
            this.processBatchNorm2dBackwardInstruction(ec);
            return;
        }
        if (this.instOpcode.equalsIgnoreCase(Opcodes.LSTM.toString())) {
            this.processLSTMInstruction(ec, false);
            return;
        }
        if (this.instOpcode.equalsIgnoreCase(Opcodes.LSTM_BACKWARD.toString())) {
            this.processLSTMInstruction(ec, true);
            return;
        }
        MatrixBlock outputBlock = null;
        MatrixBlock matBlock = this.instOpcode.equalsIgnoreCase(Opcodes.AVGPOOLING_BACKWARD.toString()) ? null : ec.getMatrixInput(this.input1.getName());
        int pad_h = DnnCPInstruction.getScalarInput(ec, this._padding, 0);
        int pad_w = DnnCPInstruction.getScalarInput(ec, this._padding, 1);
        int stride_h = DnnCPInstruction.getScalarInput(ec, this._stride, 0);
        int stride_w = DnnCPInstruction.getScalarInput(ec, this._stride, 1);
        int N = DnnCPInstruction.getScalarInput(ec, this._input_shape, 0);
        int C = DnnCPInstruction.getScalarInput(ec, this._input_shape, 1);
        int H = DnnCPInstruction.getScalarInput(ec, this._input_shape, 2);
        int W = DnnCPInstruction.getScalarInput(ec, this._input_shape, 3);
        int K2 = DnnCPInstruction.getScalarInput(ec, this._filter_shape, 0);
        int R = DnnCPInstruction.getScalarInput(ec, this._filter_shape, 2);
        int S = DnnCPInstruction.getScalarInput(ec, this._filter_shape, 3);
        int P2 = (int)DnnUtils.getP(H, R, stride_h, pad_h);
        int Q = (int)DnnUtils.getQ(W, S, stride_w, pad_w);
        DnnParameters params = new DnnParameters(N, C, H, W, K2, R, S, stride_h, stride_w, pad_h, pad_w, this._numThreads);
        params.enableNative = NativeHelper.isNativeLibraryLoaded();
        if (this.instOpcode.equalsIgnoreCase(Opcodes.MAXPOOLING.toString()) || this.instOpcode.equalsIgnoreCase(Opcodes.RELU_MAXPOOLING.toString()) || this.instOpcode.equalsIgnoreCase(Opcodes.AVGPOOLING.toString())) {
            if (matBlock.isEmpty()) {
                outputBlock = new MatrixBlock(N, C * P2 * Q, true);
            } else {
                LibMatrixDNN.PoolingType poolType;
                outputBlock = new MatrixBlock(N, C * P2 * Q, false).allocateBlock();
                LibMatrixDNN.PoolingType poolingType = poolType = this.instOpcode.equalsIgnoreCase(Opcodes.MAXPOOLING.toString()) || this.instOpcode.equalsIgnoreCase(Opcodes.RELU_MAXPOOLING.toString()) ? LibMatrixDNN.PoolingType.MAX : LibMatrixDNN.PoolingType.AVG;
                if (this.instOpcode.equalsIgnoreCase(Opcodes.RELU_MAXPOOLING.toString())) {
                    params.minValForMaxPoolOperations = 0.0;
                }
                LibMatrixDNN.pooling(matBlock, outputBlock, params, poolType);
            }
        } else if (this.instOpcode.equalsIgnoreCase(Opcodes.MAXPOOLING_BACKWARD.toString()) || this.instOpcode.equalsIgnoreCase(Opcodes.RELU_MAXPOOLING_BACKWARD.toString()) || this.instOpcode.equalsIgnoreCase(Opcodes.AVGPOOLING_BACKWARD.toString())) {
            boolean isEmpty;
            MatrixBlock dout = ec.getMatrixInput(this._in2.getName());
            boolean bl = this.instOpcode.equalsIgnoreCase(Opcodes.AVGPOOLING_BACKWARD.toString()) ? dout.isEmpty() : (isEmpty = matBlock.isEmpty() || dout.isEmpty());
            if (isEmpty) {
                outputBlock = new MatrixBlock(N, C * H * W, true);
            } else {
                LibMatrixDNN.PoolingType poolType = this.instOpcode.equalsIgnoreCase(Opcodes.MAXPOOLING_BACKWARD.toString()) || this.instOpcode.equalsIgnoreCase(Opcodes.RELU_MAXPOOLING_BACKWARD.toString()) ? LibMatrixDNN.PoolingType.MAX : LibMatrixDNN.PoolingType.AVG;
                outputBlock = poolType == LibMatrixDNN.PoolingType.MAX ? new MatrixBlock(N, C * H * W, true).allocateBlock() : new MatrixBlock(N, C * H * W, false).allocateBlock();
                boolean performReLUBackward = this.instOpcode.equalsIgnoreCase(Opcodes.RELU_MAXPOOLING_BACKWARD.toString());
                if (performReLUBackward) {
                    params.minValForMaxPoolOperations = 0.0;
                }
                LibMatrixDNN.poolingBackward(matBlock, dout, outputBlock, params, performReLUBackward, poolType);
            }
            ec.releaseMatrixInput(this._in2.getName());
        } else if (this.instOpcode.equalsIgnoreCase(Opcodes.CONV2D.toString())) {
            this.resetNumThreads(params, C * R * S, P2 * Q, matBlock.getNonZeros() / (long)(matBlock.getNumRows() * matBlock.getNumColumns()));
            MatrixBlock filter = ec.getMatrixInput(this._in2.getName());
            if (filter.isEmpty() || matBlock.isEmpty()) {
                outputBlock = new MatrixBlock(N, K2 * P2 * Q, true);
            } else {
                boolean sparse = matBlock.isUltraSparse(false) && params.bias == null && matBlock.getInMemorySize() < MatrixBlock.estimateSizeDenseInMemory(N, K2 * P2 * Q);
                outputBlock = new MatrixBlock(N, K2 * P2 * Q, sparse).allocateBlock();
                if (params.enableNative && matBlock.isInSparseFormat()) {
                    matBlock.sparseToDense();
                }
                if (params.enableNative && !DnnCPInstruction.isFilterSparse(filter) && !matBlock.isInSparseFormat()) {
                    LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
                } else {
                    LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
                }
            }
            ec.releaseMatrixInput(this._in2.getName());
        } else if (this.instOpcode.equalsIgnoreCase(Opcodes.CONV2D_BIAS_ADD.toString())) {
            boolean isOutputConvEmpty;
            this.resetNumThreads(params, C * R * S, P2 * Q, matBlock.getNonZeros() / (long)(matBlock.getNumRows() * matBlock.getNumColumns()));
            MatrixBlock filter = ec.getMatrixInput(this._in3.getName());
            MatrixBlock bias = ec.getMatrixInput(this._in2.getName());
            if (bias.getNumRows() != params.K || bias.getNumColumns() != 1) {
                throw new DMLRuntimeException("Incorrect shape of bias matrix: [" + bias.getNumRows() + " " + bias.getNumColumns() + "]. Expected: [" + params.K + ", 1]");
            }
            boolean bl = isOutputConvEmpty = filter.isEmpty() || matBlock.isEmpty();
            if (isOutputConvEmpty && bias.isEmpty()) {
                outputBlock = new MatrixBlock(N, K2 * P2 * Q, true);
            } else if (isOutputConvEmpty && !bias.isEmpty()) {
                outputBlock = new MatrixBlock(N, K2 * P2 * Q, false).allocateBlock();
                for (int n = 0; n < params.N; ++n) {
                    DnnUtils.fillBias(bias, outputBlock.getDenseBlockValues(), n, n + 1, params.N, params.K, params.P * params.Q);
                }
            } else {
                outputBlock = new MatrixBlock(N, K2 * P2 * Q, false).allocateBlock();
                if (!bias.isEmpty()) {
                    params.bias = bias;
                }
                if (params.enableNative && matBlock.isInSparseFormat()) {
                    matBlock.sparseToDense();
                }
                if (params.enableNative && !DnnCPInstruction.isFilterSparse(filter) && !matBlock.isInSparseFormat()) {
                    LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
                } else {
                    LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
                }
            }
            ec.releaseMatrixInput(this._in3.getName(), this._in2.getName());
        } else if (this.instOpcode.equalsIgnoreCase(Opcodes.CONV2D_BACKWARD_FILTER.toString())) {
            MatrixBlock dout = ec.getMatrixInput(this._in2.getName());
            if (dout.isEmpty() || matBlock.isEmpty()) {
                outputBlock = new MatrixBlock(K2, C * R * S, true);
            } else {
                outputBlock = new MatrixBlock(K2, C * R * S, false).allocateBlock();
                if (params.enableNative && !matBlock.isInSparseFormat() && !dout.isInSparseFormat()) {
                    LibMatrixNative.conv2dBackwardFilter(matBlock, dout, outputBlock, params);
                } else {
                    LibMatrixDNN.conv2dBackwardFilter(matBlock, dout, outputBlock, params);
                }
            }
            ec.releaseMatrixInput(this._in2.getName());
        } else if (this.instOpcode.equalsIgnoreCase(Opcodes.CONV2D_BACKWARD_DATA.toString())) {
            MatrixBlock dout = ec.getMatrixInput(this._in2.getName());
            if (dout.isEmpty() || matBlock.isEmpty()) {
                outputBlock = new MatrixBlock(N, C * H * W, true);
            } else {
                outputBlock = new MatrixBlock(N, C * H * W, false).allocateBlock();
                if (params.enableNative && !DnnCPInstruction.isFilterSparse(matBlock) && !dout.isInSparseFormat()) {
                    LibMatrixNative.conv2dBackwardData(matBlock, dout, outputBlock, params);
                } else {
                    LibMatrixDNN.conv2dBackwardData(matBlock, dout, outputBlock, params);
                }
            }
            ec.releaseMatrixInput(this._in2.getName());
        } else {
            throw new DMLRuntimeException("Unsupported op code " + this.instOpcode);
        }
        if (!this.instOpcode.equalsIgnoreCase(Opcodes.AVGPOOLING_BACKWARD.toString())) {
            ec.releaseMatrixInput(this.input1.getName());
        }
        ec.setMatrixOutput(this.getOutputVariableName(), outputBlock);
    }

    /**
     * Lowers {@code params.numThreads} so that the estimated per-thread intermediates
     * fit into {@code _intermediateMemoryBudget}. Does nothing unless
     * {@code DMLScript.USE_ACCELERATOR} is set.
     *
     * <p>The footprint of one thread is estimated with
     * {@code OptimizerUtils.estimateSizeExactSparsity(numRows, numCols, sparsity)}; the
     * thread cap is {@code floor(budget / footprint)}, but never less than one. The
     * under-utilization warning is logged at most once (guarded by the static
     * {@code warnedUnderUtilitization} flag).
     *
     * @param params   DNN parameters whose {@code numThreads} field may be reduced in place
     * @param numRows  row count passed to the size estimate
     * @param numCols  column count passed to the size estimate
     * @param sparsity sparsity passed to the size estimate
     */
    private void resetNumThreads(DnnParameters params, int numRows, int numCols, double sparsity) {
        if (!DMLScript.USE_ACCELERATOR) {
            return;
        }
        double memBudget1Thread = (double)OptimizerUtils.estimateSizeExactSparsity((long)numRows, (long)numCols, sparsity);
        int budgetedThreads = (int)Math.floor(this._intermediateMemoryBudget / memBudget1Thread);
        // If a single thread's estimate exceeds the budget the quotient floors to 0
        // (and a NaN quotient from 0/0 also casts to 0). A non-positive thread count is
        // not a meaningful degree of parallelism, so fall back to one thread.
        // NOTE(review): how downstream code treats numThreads <= 0 is not visible here - confirm.
        int limitedDegreeOfParallelism = Math.max(1, budgetedThreads);
        if (params.numThreads <= limitedDegreeOfParallelism) {
            return;
        }
        params.numThreads = limitedDegreeOfParallelism;
        if (!warnedUnderUtilitization) {
            LOG.warn("CPU Under-utilization to respect the intermediate memory budget. To avoid this, please try reducing the mini-batch or forcing gpu execution.");
        }
        warnedUnderUtilitization = true;
    }

    /**
     * Builds the lineage entry for this instruction's output.
     *
     * <p>The lineage inputs are, in order: {@code input1}, {@code _in2} through
     * {@code _in8}, followed by the operands of {@code _input_shape},
     * {@code _filter_shape}, {@code _stride} and {@code _padding}.
     *
     * @param ec execution context used to resolve the lineage of the input operands
     * @return pair of the output variable name and a {@link LineageItem} built from
     *         this instruction's opcode and the lineage of the collected inputs
     */
    @Override
    public Pair<String, LineageItem> getLineageItem(ExecutionContext ec) {
        // Fixed operands first, in their declared order.
        CPOperand[] fixedOperands = {
            this.input1, this._in2, this._in3, this._in4,
            this._in5, this._in6, this._in7, this._in8
        };
        ArrayList<CPOperand> collected = new ArrayList<>();
        for (CPOperand operand : fixedOperands) {
            collected.add(operand);
        }
        // Then the variable-length operand groups.
        collected.addAll(this._input_shape);
        collected.addAll(this._filter_shape);
        collected.addAll(this._stride);
        collected.addAll(this._padding);

        CPOperand[] operands = collected.toArray(new CPOperand[0]);
        LineageItem item = new LineageItem(this.getOpcode(), LineageItemUtils.getLineage(ec, operands));
        return Pair.of(this.output.getName(), item);
    }
}

