/*
 * Decompiled with CFR 0.152.
 */
package org.apache.mahout.clustering.streaming.mapreduce;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable;
import org.apache.mahout.clustering.streaming.mapreduce.StreamingKMeansMapper;
import org.apache.mahout.clustering.streaming.mapreduce.StreamingKMeansReducer;
import org.apache.mahout.clustering.streaming.mapreduce.StreamingKMeansThread;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.math.Centroid;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.neighborhood.BruteSearch;
import org.apache.mahout.math.neighborhood.ProjectionSearch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public final class StreamingKMeansDriver
extends AbstractJob {
    public static final String ESTIMATED_NUM_MAP_CLUSTERS = "estimatedNumMapClusters";
    public static final String ESTIMATED_DISTANCE_CUTOFF = "estimatedDistanceCutoff";
    public static final String MAX_NUM_ITERATIONS = "maxNumIterations";
    public static final String TRIM_FRACTION = "trimFraction";
    public static final String RANDOM_INIT = "randomInit";
    public static final String IGNORE_WEIGHTS = "ignoreWeights";
    public static final String TEST_PROBABILITY = "testProbability";
    public static final String NUM_BALLKMEANS_RUNS = "numBallKMeansRuns";
    public static final String SEARCHER_CLASS_OPTION = "searcherClass";
    public static final String NUM_PROJECTIONS_OPTION = "numProjections";
    public static final String SEARCH_SIZE_OPTION = "searchSize";
    public static final String REDUCE_STREAMING_KMEANS = "reduceStreamingKMeans";
    private static final Logger log = LoggerFactory.getLogger(StreamingKMeansDriver.class);
    public static final float INVALID_DISTANCE_CUTOFF = -1.0f;

    public int run(String[] args) throws Exception {
        this.addInputOption();
        this.addOutputOption();
        this.addOption(DefaultOptionCreator.overwriteOption().create());
        this.addOption(DefaultOptionCreator.numClustersOption().withDescription("The k in k-Means. Approximately this many clusters will be generated.").create());
        this.addOption(ESTIMATED_NUM_MAP_CLUSTERS, "km", "The estimated number of clusters to use for the Map phase of the job when running StreamingKMeans. This should be around k * log(n), where k is the final number of clusters and n is the total number of data points to cluster.");
        this.addOption(ESTIMATED_DISTANCE_CUTOFF, "e", "The initial estimated distance cutoff between two points for forming new clusters. If no value is given, it's estimated from the data set", String.valueOf(-1.0f));
        this.addOption(MAX_NUM_ITERATIONS, "mi", "The maximum number of iterations to run for the BallKMeans algorithm used by the reducer. If no value is given, defaults to 10.", String.valueOf(10));
        this.addOption(TRIM_FRACTION, "tf", "The 'ball' aspect of ball k-means means that only the closest points to the centroid will actually be used for updating. The fraction of the points to be used is those points whose distance to the center is within trimFraction * distance to the closest other center. If no value is given, defaults to 0.9.", String.valueOf(0.9));
        this.addFlag(RANDOM_INIT, "ri", "Whether to use k-means++ initialization or random initialization of the seed centroids. Essentially, k-means++ provides better clusters, but takes longer, whereas random initialization takes less time, but produces worse clusters, and tends to fail more often and needs multiple runs to compare to k-means++. If set, uses the random initialization.");
        this.addFlag(IGNORE_WEIGHTS, "iw", "Whether to correct the weights of the centroids after the clustering is done. The weights end up being wrong because of the trimFraction and possible train/test splits. In some cases, especially in a pipeline, having an accurate count of the weights is useful. If set, ignores the final weights");
        this.addOption(TEST_PROBABILITY, "testp", "A double value between 0 and 1 that represents the percentage of points to be used for 'testing' different clustering runs in the final BallKMeans step. If no value is given, defaults to 0.1", String.valueOf(0.1));
        this.addOption(NUM_BALLKMEANS_RUNS, "nbkm", "Number of BallKMeans runs to use at the end to try to cluster the points. If no value is given, defaults to 4", String.valueOf(4));
        this.addOption(DefaultOptionCreator.distanceMeasureOption().create());
        this.addOption(SEARCHER_CLASS_OPTION, "sc", "The type of searcher to be used when performing nearest neighbor searches. Defaults to ProjectionSearch.", ProjectionSearch.class.getCanonicalName());
        this.addOption(NUM_PROJECTIONS_OPTION, "np", "The number of projections considered in estimating the distances between vectors. Only used when the distance measure requested is either ProjectionSearch or FastProjectionSearch. If no value is given, defaults to 3.", String.valueOf(3));
        this.addOption(SEARCH_SIZE_OPTION, "s", "In more efficient searches (non BruteSearch), not all distances are calculated for determining the nearest neighbors. The number of elements whose distances from the query vector is actually computer is proportional to searchSize. If no value is given, defaults to 1.", String.valueOf(2));
        this.addFlag(REDUCE_STREAMING_KMEANS, "rskm", "There might be too many intermediate clusters from the mapper to fit into memory, so the reducer can run another pass of StreamingKMeans to collapse them down to a fewer clusters");
        this.addOption(DefaultOptionCreator.methodOption().create());
        if (this.parseArguments(args) == null) {
            return -1;
        }
        Path output = this.getOutputPath();
        if (this.hasOption("overwrite")) {
            HadoopUtil.delete(this.getConf(), output);
        }
        this.configureOptionsForWorkers();
        StreamingKMeansDriver.run(this.getConf(), this.getInputPath(), output);
        return 0;
    }

    private void configureOptionsForWorkers() throws ClassNotFoundException {
        log.info("Starting to configure options for workers");
        String method = this.getOption("method");
        int numClusters = Integer.parseInt(this.getOption("numClusters"));
        int estimatedNumMapClusters = Integer.parseInt(this.getOption(ESTIMATED_NUM_MAP_CLUSTERS));
        float estimatedDistanceCutoff = Float.parseFloat(this.getOption(ESTIMATED_DISTANCE_CUTOFF));
        int maxNumIterations = Integer.parseInt(this.getOption(MAX_NUM_ITERATIONS));
        float trimFraction = Float.parseFloat(this.getOption(TRIM_FRACTION));
        boolean randomInit = this.hasOption(RANDOM_INIT);
        boolean ignoreWeights = this.hasOption(IGNORE_WEIGHTS);
        float testProbability = Float.parseFloat(this.getOption(TEST_PROBABILITY));
        int numBallKMeansRuns = Integer.parseInt(this.getOption(NUM_BALLKMEANS_RUNS));
        String measureClass = this.getOption("distanceMeasure");
        String searcherClass = this.getOption(SEARCHER_CLASS_OPTION);
        boolean getSearchSize = false;
        boolean getNumProjections = false;
        if (!searcherClass.equals(BruteSearch.class.getName())) {
            getSearchSize = true;
            getNumProjections = true;
        }
        int searchSize = 0;
        if (getSearchSize) {
            searchSize = Integer.parseInt(this.getOption(SEARCH_SIZE_OPTION));
        }
        int numProjections = 0;
        if (getNumProjections) {
            numProjections = Integer.parseInt(this.getOption(NUM_PROJECTIONS_OPTION));
        }
        boolean reduceStreamingKMeans = this.hasOption(REDUCE_STREAMING_KMEANS);
        StreamingKMeansDriver.configureOptionsForWorkers(this.getConf(), numClusters, estimatedNumMapClusters, estimatedDistanceCutoff, maxNumIterations, trimFraction, randomInit, ignoreWeights, testProbability, numBallKMeansRuns, measureClass, searcherClass, searchSize, numProjections, method, reduceStreamingKMeans);
    }

    public static void configureOptionsForWorkers(Configuration conf, int numClusters, int estimatedNumMapClusters, float estimatedDistanceCutoff, int maxNumIterations, float trimFraction, boolean randomInit, boolean ignoreWeights, float testProbability, int numBallKMeansRuns, String measureClass, String searcherClass, int searchSize, int numProjections, String method, boolean reduceStreamingKMeans) throws ClassNotFoundException {
        Preconditions.checkArgument(numClusters > 0, "Invalid number of clusters requested: " + numClusters + ". Must be: numClusters > 0!");
        Preconditions.checkArgument(estimatedNumMapClusters > numClusters, "Invalid number of estimated map clusters; There must be more than the final number of clusters (k log n vs k)");
        Preconditions.checkArgument(estimatedDistanceCutoff == -1.0f || estimatedDistanceCutoff > 0.0f, "estimatedDistanceCutoff must be equal to -1 or must be greater then 0!");
        Preconditions.checkArgument(maxNumIterations > 0, "Must have at least one BallKMeans iteration");
        Preconditions.checkArgument(trimFraction > 0.0f, "trimFraction must be positive");
        Preconditions.checkArgument(testProbability >= 0.0f && testProbability < 1.0f, "test probability is not in the interval [0, 1)");
        Preconditions.checkArgument(numBallKMeansRuns > 0, "numBallKMeans cannot be negative");
        if (!searcherClass.contains("Brute")) {
            Preconditions.checkArgument(searchSize > 0, "Invalid searchSize. Must be positive.");
            if (searcherClass.contains("Projection")) {
                Preconditions.checkArgument(numProjections > 0, "Invalid numProjections. Must be positive");
            }
        }
        conf.setInt("numClusters", numClusters);
        conf.setInt(ESTIMATED_NUM_MAP_CLUSTERS, estimatedNumMapClusters);
        if (estimatedDistanceCutoff != -1.0f) {
            conf.setFloat(ESTIMATED_DISTANCE_CUTOFF, estimatedDistanceCutoff);
        }
        conf.setInt(MAX_NUM_ITERATIONS, maxNumIterations);
        conf.setFloat(TRIM_FRACTION, trimFraction);
        conf.setBoolean(RANDOM_INIT, randomInit);
        conf.setBoolean(IGNORE_WEIGHTS, ignoreWeights);
        conf.setFloat(TEST_PROBABILITY, testProbability);
        conf.setInt(NUM_BALLKMEANS_RUNS, numBallKMeansRuns);
        Class.forName(measureClass);
        conf.set("distanceMeasure", measureClass);
        Class.forName(searcherClass);
        conf.set(SEARCHER_CLASS_OPTION, searcherClass);
        conf.setInt(SEARCH_SIZE_OPTION, searchSize);
        conf.setInt(NUM_PROJECTIONS_OPTION, numProjections);
        conf.set("method", method);
        conf.setBoolean(REDUCE_STREAMING_KMEANS, reduceStreamingKMeans);
        log.info("Parameters are: [k] numClusters {}; [SKM] estimatedNumMapClusters {}; estimatedDistanceCutoff {} [BKM] maxNumIterations {}; trimFraction {}; randomInit {}; ignoreWeights {}; testProbability {}; numBallKMeansRuns {}; [S] measureClass {}; searcherClass {}; searcherSize {}; numProjections {}; method {}; reduceStreamingKMeans {}", numClusters, estimatedNumMapClusters, Float.valueOf(estimatedDistanceCutoff), maxNumIterations, Float.valueOf(trimFraction), randomInit, ignoreWeights, Float.valueOf(testProbability), numBallKMeansRuns, measureClass, searcherClass, searchSize, numProjections, method, reduceStreamingKMeans);
    }

    public static int run(Configuration conf, Path input, Path output) throws IOException, InterruptedException, ClassNotFoundException, ExecutionException {
        log.info("Starting StreamingKMeans clustering for vectors in {}; results are output to {}", (Object)input.toString(), (Object)output.toString());
        if (conf.get("method", "mapreduce").equals("sequential")) {
            return StreamingKMeansDriver.runSequentially(conf, input, output);
        }
        return StreamingKMeansDriver.runMapReduce(conf, input, output);
    }

    /*
     * WARNING - void declaration
     */
    private static int runSequentially(Configuration conf, Path input, Path output) throws IOException, ExecutionException, InterruptedException {
        void var9_10;
        long start = System.currentTimeMillis();
        ExecutorService pool = Executors.newCachedThreadPool();
        ArrayList<Future<Iterable<Centroid>>> intermediateCentroidFutures = Lists.newArrayList();
        FileStatus[] arr$ = HadoopUtil.listStatus(FileSystem.get((Configuration)conf), input, PathFilters.logsCRCFilter());
        int len$ = arr$.length;
        boolean bl = false;
        while (var9_10 < len$) {
            FileStatus status = arr$[var9_10];
            intermediateCentroidFutures.add(pool.submit(new StreamingKMeansThread(status.getPath(), conf)));
            ++var9_10;
        }
        log.info("Finished running Mappers");
        ArrayList<Centroid> intermediateCentroids = Lists.newArrayList();
        for (Future future : intermediateCentroidFutures) {
            for (Centroid centroid : (Iterable)future.get()) {
                intermediateCentroids.add(centroid);
            }
        }
        pool.shutdown();
        pool.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
        log.info("Finished StreamingKMeans");
        SequenceFile.Writer writer = SequenceFile.createWriter((FileSystem)FileSystem.get((Configuration)conf), (Configuration)conf, (Path)new Path(output, "part-r-00000"), IntWritable.class, CentroidWritable.class);
        boolean bl2 = false;
        for (Vector finalVector : StreamingKMeansReducer.getBestCentroids(intermediateCentroids, conf)) {
            void var9_13;
            Centroid finalCentroid = (Centroid)finalVector;
            writer.append((Writable)new IntWritable((int)(++var9_13)), (Writable)new CentroidWritable(finalCentroid));
        }
        writer.close();
        long end = System.currentTimeMillis();
        log.info("Finished BallKMeans. Took {}.", (Object)((double)(end - start) / 1000.0));
        return 0;
    }

    public static int runMapReduce(Configuration conf, Path input, Path output) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class, StreamingKMeansMapper.class, IntWritable.class, CentroidWritable.class, StreamingKMeansReducer.class, IntWritable.class, CentroidWritable.class, SequenceFileOutputFormat.class, conf);
        job.setJobName(HadoopUtil.getCustomJobName(StreamingKMeansDriver.class.getSimpleName(), (JobContext)job, StreamingKMeansMapper.class, StreamingKMeansReducer.class));
        job.setNumReduceTasks(1);
        job.setJarByClass(StreamingKMeansDriver.class);
        long start = System.currentTimeMillis();
        if (!job.waitForCompletion(true)) {
            return -1;
        }
        long end = System.currentTimeMillis();
        log.info("StreamingKMeans clustering complete. Results are in {}. Took {} ms", (Object)output.toString(), (Object)(end - start));
        return 0;
    }

    private StreamingKMeansDriver() {
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run((Tool)new StreamingKMeansDriver(), (String[])args);
    }
}

