/*
 * Decompiled with CFR 0.152.
 */
package de.lmu.ifi.dbs.elki.datasource.parser;

import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.parser.CSVReaderFormat;
import de.lmu.ifi.dbs.elki.datasource.parser.NumberVectorLabelParser;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
import gnu.trove.iterator.TIntDoubleIterator;
import gnu.trove.map.TIntDoubleMap;
import gnu.trove.map.TObjectIntMap;
import gnu.trove.map.hash.TIntDoubleHashMap;
import gnu.trove.map.hash.TObjectIntHashMap;
import java.util.ArrayList;

/**
 * Parser for term frequency files that produces sparse number vectors.
 *
 * <p>Each input line is read token by token. Tokens in configured label columns
 * are collected as labels. The remaining tokens are consumed as alternating
 * {@code term frequency} pairs; each distinct term is assigned its own vector
 * dimension in order of first appearance. A token that was expected to be a
 * frequency but cannot be parsed as a number causes the pending term to be
 * treated as a label instead.
 *
 * <p>Optionally, every vector is scaled so that its stored values sum to 1.
 *
 * @param <V> sparse vector type produced by this parser
 */
@Title(value="Term frequency parser")
@Description(value="Parse a file containing term frequencies. The expected format is 'label term1 <freq> term2 <freq> ...'. Terms must not contain the separator character!")
public class TermFrequencyParser<V extends SparseNumberVector>
extends NumberVectorLabelParser<V> {
    /** Class logger. */
    private static final Logging LOG = Logging.getLogger(TermFrequencyParser.class);

    /** Number of distinct terms seen so far; also the next unused dimension index. */
    int numterms;

    /** Maps each term to its dimension index; lookups of unknown terms yield a negative value. */
    TObjectIntMap<String> keymap;

    /** Whether vectors are rescaled so that their values sum to 1. */
    boolean normalize;

    /** Factory used to build the sparse vector for each parsed line. */
    private final SparseNumberVector.Factory<V> sparsefactory;

    /** Per-line buffer of (dimension, frequency) entries; emptied after every line. */
    TIntDoubleHashMap values = new TIntDoubleHashMap();

    /** Per-line buffer of labels; emptied after every line. */
    ArrayList<String> labels = new ArrayList<>();

    /**
     * Creates a term frequency parser.
     *
     * @param normalize {@code true} to rescale each vector so its values sum to 1
     * @param format input format description, passed on to the superclass
     * @param labelIndices label column selection, passed on to the superclass
     * @param factory factory for the sparse vectors to produce
     */
    public TermFrequencyParser(boolean normalize, CSVReaderFormat format, long[] labelIndices, SparseNumberVector.Factory<V> factory) {
        super(format, labelIndices, factory);
        this.normalize = normalize;
        // Third argument is the "no entry" value: unknown terms look up as -1.
        this.keymap = new TObjectIntHashMap<>(1001, 0.5f, -1);
        this.sparsefactory = factory;
    }

    /**
     * Parses the current line into {@code curvec} (the sparse vector) and
     * {@code curlbl} (the labels).
     *
     * <p>If a term occurs more than once on a line, the last frequency is the
     * one stored. When normalization is enabled, the divisor is the sum of the
     * frequencies that are actually stored, and normalization is skipped when
     * that sum is zero so that no NaN values are produced.
     *
     * @return always {@code true}
     */
    @Override
    protected boolean parseLineInternal() {
        double sum = 0.0;
        String pendingTerm = null;
        int column = 0;
        for (; this.tokenizer.valid(); this.tokenizer.advance()) {
            if (this.isLabelColumn(column++)) {
                this.labels.add(this.tokenizer.getSubstring());
                continue;
            }
            if (pendingTerm == null) {
                // First half of a "term frequency" pair.
                pendingTerm = this.tokenizer.getSubstring();
                continue;
            }
            try {
                double frequency = this.tokenizer.getDouble();
                int dim = this.keymap.get(pendingTerm);
                if (dim < 0) {
                    // Previously unseen term: assign the next free dimension.
                    dim = this.numterms++;
                    this.keymap.put(pendingTerm, dim);
                }
                if (this.values.containsKey(dim)) {
                    // Repeated term on this line: the earlier frequency is about to be
                    // overwritten, so it must no longer count towards the sum.
                    sum -= this.values.get(dim);
                }
                this.values.put(dim, frequency);
                sum += frequency;
                pendingTerm = null;
            }
            catch (NumberFormatException ignored) {
                // Deliberate: the token is not a number, so the pending term was
                // really a label, and the current token becomes the new pending term.
                if (pendingTerm != null) {
                    this.labels.add(pendingTerm);
                }
                pendingTerm = this.tokenizer.getSubstring();
            }
        }
        if (pendingTerm != null) {
            // Trailing term without a frequency is kept as a label.
            this.labels.add(pendingTerm);
        }
        this.haslabels |= !this.labels.isEmpty();
        // Skip the division when the sum is zero (would yield NaN) or already 1.
        if (this.normalize && sum != 0.0 && Math.abs(sum - 1.0) > Double.MIN_NORMAL) {
            for (TIntDoubleIterator it = this.values.iterator(); it.hasNext(); ) {
                it.advance();
                it.setValue(it.value() / sum);
            }
        }
        this.curvec = this.sparsefactory.newNumberVector(this.values, this.numterms);
        this.curlbl = LabelList.make(this.labels);
        // Reset the per-line buffers for the next call.
        this.values.clear();
        this.labels.clear();
        return true;
    }

    /**
     * Builds the type information for the produced vectors.
     *
     * @param mindim first dimensionality bound
     * @param maxdim second dimensionality bound
     * @return a vector field type when both bounds are equal, otherwise a
     *         variable-dimensionality vector type
     * @throws AbortException if {@code mindim} is larger than {@code maxdim}
     */
    @Override
    protected SimpleTypeInformation<V> getTypeInformation(int mindim, int maxdim) {
        if (mindim == maxdim) {
            return new VectorFieldTypeInformation<>(this.factory, mindim);
        }
        if (mindim < maxdim) {
            return new VectorTypeInformation<>(this.factory, this.factory.getDefaultSerializer(), mindim, maxdim);
        }
        throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
    }

    /**
     * Returns the logger of this class.
     *
     * @return class logger
     */
    @Override
    protected Logging getLogger() {
        return LOG;
    }

    /**
     * Parameterization class for {@link TermFrequencyParser}.
     *
     * @param <V> sparse vector type produced by the configured parser
     */
    public static class Parameterizer<V extends SparseNumberVector>
    extends NumberVectorLabelParser.Parameterizer<V> {
        /** Option id of the flag that enables normalization. */
        public static final OptionID NORMALIZE_FLAG = new OptionID("tf.normalize", "Normalize vectors to manhattan length 1 (convert term counts to term frequencies)");

        /** Value of the normalization flag; off unless the flag is set. */
        boolean normalize = false;

        /**
         * Reads the superclass options and then the normalization flag.
         *
         * @param parameterization parameter source
         */
        @Override
        protected void makeOptions(Parameterization parameterization) {
            super.makeOptions(parameterization);
            Flag normalizeFlag = new Flag(NORMALIZE_FLAG);
            if (parameterization.grab(normalizeFlag)) {
                this.normalize = normalizeFlag.isTrue();
            }
        }

        /**
         * Reads the vector factory option, restricted to sparse vector factories
         * and defaulting to {@link SparseFloatVector.Factory}.
         *
         * @param parameterization parameter source
         */
        @Override
        protected void getFactory(Parameterization parameterization) {
            ObjectParameter<SparseNumberVector.Factory<V>> factoryParam = new ObjectParameter<>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
            if (parameterization.grab(factoryParam)) {
                this.factory = factoryParam.instantiateClass(parameterization);
            }
        }

        /**
         * Creates the configured parser.
         *
         * @return new parser instance
         */
        @Override
        protected TermFrequencyParser<V> makeInstance() {
            // getFactory only accepts sparse factories, hence the downcast.
            return new TermFrequencyParser<>(this.normalize, this.format, this.labelIndices, (SparseNumberVector.Factory<V>) this.factory);
        }
    }
}

