/*
 * Decompiled with CFR 0.152.
 */
package de.lmu.ifi.dbs.elki.datasource.parser;

import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
import de.lmu.ifi.dbs.elki.datasource.parser.CSVReaderFormat;
import de.lmu.ifi.dbs.elki.datasource.parser.NumberVectorLabelParser;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import gnu.trove.map.hash.TObjectIntHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Parser that behaves like {@link NumberVectorLabelParser} but additionally
 * accepts non-numeric tokens in attribute columns: every distinct string is
 * assigned an integer code, so that the resulting vectors are always numeric.
 *
 * A token consisting of a single question mark is stored as {@code NaN}.
 *
 * @param <V> vector type produced by this parser
 */
@Description(value="This parser expects data in roughly the same format as the NumberVectorLabelParser,\nexcept that it will enumerate all unique strings to always produce numerical values.\nThis way, it can for example handle files that contain lines like 'y,n,y,y,n,y,n'.")
public class CategorialDataAsNumberVectorParser<V extends NumberVector>
extends NumberVectorLabelParser<V> {
    /** Class logger. */
    private static final Logging LOG = Logging.getLogger(CategorialDataAsNumberVectorParser.class);

    /**
     * Codes assigned so far, keyed by the token text. The key is the string
     * alone, so the same string receives the same code in every column.
     */
    TObjectIntHashMap<String> unique = new TObjectIntHashMap<String>();

    /**
     * Base value for newly assigned codes. It is at least 1 and always larger
     * than the map's "no entry" value, so an assigned code can never be
     * confused with a failed lookup.
     */
    int ustart = Math.max(this.unique.getNoEntryValue() + 1, 1);

    /**
     * Reusable matcher recognizing the token {@code ?}; it is reset onto each
     * candidate token before matching. The initial input text is a placeholder.
     */
    Matcher nanpattern = Pattern.compile("\\?").matcher("Dummy text");

    /**
     * Creates a parser using {@link CSVReaderFormat#DEFAULT_FORMAT} and passing
     * {@code null} as the label index set.
     *
     * @param factory factory used to build the vectors
     */
    public CategorialDataAsNumberVectorParser(NumberVector.Factory<V> factory) {
        this(CSVReaderFormat.DEFAULT_FORMAT, null, factory);
    }

    /**
     * Creates a parser; all arguments are forwarded unchanged to the
     * {@link NumberVectorLabelParser} constructor.
     *
     * @param cSVReaderFormat reader format
     * @param lArray label column selection as expected by the superclass
     * @param factory factory used to build the vectors
     */
    public CategorialDataAsNumberVectorParser(CSVReaderFormat cSVReaderFormat, long[] lArray, NumberVector.Factory<V> factory) {
        super(cSVReaderFormat, lArray, factory);
    }

    /**
     * Delegates to the superclass and, once the end of the stream is reported,
     * discards all codes assigned so far.
     *
     * @return the event reported by the superclass
     */
    @Override
    public BundleStreamSource.Event nextEvent() {
        BundleStreamSource.Event event = super.nextEvent();
        if (event == BundleStreamSource.Event.END_OF_STREAM) {
            this.unique.clear();
        }
        return event;
    }

    /**
     * Consumes the remaining tokens of the current line. Tokens in label
     * columns are collected as labels; all other tokens become attribute
     * values: parsed as a double when possible, {@code NaN} for {@code ?}, and
     * otherwise the integer code of the token text.
     *
     * The result is published through {@code curvec} and {@code curlbl}, after
     * which the per-line buffers are cleared.
     *
     * @return always {@code true}
     */
    @Override
    protected boolean parseLineInternal() {
        for (int column = 0; this.tokenizer.valid(); this.tokenizer.advance(), ++column) {
            if (this.isLabelColumn(column)) {
                this.haslabels = true;
                this.labels.add(this.tokenizer.getSubstring());
                continue;
            }
            try {
                this.attributes.add(this.tokenizer.getDouble());
            }
            catch (NumberFormatException notNumeric) {
                // Not a number: fall back to NaN or a categorical code.
                String token = this.tokenizer.getSubstring();
                if (this.nanpattern.reset(token).matches()) {
                    this.attributes.add(Double.NaN);
                } else {
                    this.attributes.add(this.categoryCode(token));
                }
            }
        }
        // Hand the parsed line over via the inherited fields.
        this.curvec = this.createVector();
        this.curlbl = LabelList.make(this.labels);
        this.attributes.clear();
        this.labels.clear();
        return true;
    }

    /**
     * Returns the code of a token, assigning the next free code on first sight.
     *
     * @param token non-numeric token text
     * @return code associated with the token
     */
    private int categoryCode(String token) {
        int code = this.unique.get(token);
        if (code == this.unique.getNoEntryValue()) {
            // Unknown token: codes are handed out consecutively from ustart.
            code = this.ustart + this.unique.size();
            this.unique.put(token, code);
        }
        return code;
    }

    /**
     * @return the logger of this class
     */
    @Override
    protected Logging getLogger() {
        return LOG;
    }

    /**
     * Parameterization class; builds the parser from the format, label indices
     * and factory held by the inherited parameterizer.
     *
     * @param <V> vector type produced by the parser
     */
    public static class Parameterizer<V extends NumberVector>
    extends NumberVectorLabelParser.Parameterizer<V> {
        @Override
        protected CategorialDataAsNumberVectorParser<V> makeInstance() {
            return new CategorialDataAsNumberVectorParser<V>(this.format, this.labelIndices, this.factory);
        }
    }
}

