/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.crawl;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
import org.apache.commons.jexl3.JexlContext;
import org.apache.commons.jexl3.JexlScript;
import org.apache.commons.jexl3.MapContext;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.FetchSchedule;
import org.apache.nutch.crawl.FetchScheduleFactory;
import org.apache.nutch.crawl.URLPartitioner;
import org.apache.nutch.hostdb.HostDatum;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.JexlUtil;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.SegmentReaderUtil;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class Generator
extends NutchTool
implements Tool {
    // Random source; partitionSegment() draws the "partition.url.seed" value from it.
    private static final Random RANDOM = new Random();
    protected static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    // Names of the configuration properties read or written by the generator.
    // When this property is true, generate() keeps the CrawlDb lock and runs an extra
    // update job over the CrawlDb after the segments have been created.
    public static final String GENERATE_UPDATE_CRAWLDB = "generate.update.crawldb";
    public static final String GENERATOR_MIN_SCORE = "generate.min.score";
    public static final String GENERATOR_MIN_INTERVAL = "generate.min.interval";
    public static final String GENERATOR_RESTRICT_STATUS = "generate.restrict.status";
    // Set by generate() from its filter / norm arguments.
    public static final String GENERATOR_FILTER = "generate.filter";
    public static final String GENERATOR_NORMALISE = "generate.normalise";
    // Read by SelectorReducer.setup(); it defaults to -1 there.
    public static final String GENERATOR_MAX_COUNT = "generate.max.count";
    // SelectorReducer counts by domain when this property equals GENERATOR_COUNT_VALUE_DOMAIN.
    public static final String GENERATOR_COUNT_MODE = "generate.count.mode";
    public static final String GENERATOR_COUNT_VALUE_DOMAIN = "domain";
    public static final String GENERATOR_COUNT_VALUE_HOST = "host";
    // Set by generate() from its topN / curTime arguments.
    public static final String GENERATOR_TOP_N = "generate.topN";
    public static final String GENERATOR_CUR_TIME = "generate.curTime";
    public static final String GENERATOR_DELAY = "crawl.gen.delay";
    public static final String GENERATOR_MAX_NUM_SEGMENTS = "generate.max.num.segments";
    // Optional properties, only set by generate() when the matching argument is non-null.
    public static final String GENERATOR_EXPR = "generate.expr";
    public static final String GENERATOR_HOSTDB = "generate.hostdb";
    // Expressions parsed in SelectorReducer.setup() when a hostdb is configured.
    public static final String GENERATOR_MAX_COUNT_EXPR = "generate.max.count.expr";
    public static final String GENERATOR_FETCH_DELAY_EXPR = "generate.fetch.delay.expr";
    // Formats segment names. SimpleDateFormat is not thread-safe; the only use visible in
    // this file is inside the synchronized generateSegmentName().
    private static SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");

    /**
     * Creates a generator without a configuration; no configuration is set here.
     */
    public Generator() {
    }

    /**
     * Creates a generator and hands the given configuration to {@code setConf}.
     *
     * @param conf configuration to use for all jobs started by this instance
     */
    public Generator(Configuration conf) {
        setConf(conf);
    }

    /**
     * Generates fetch lists with the filter and normalise flags taken from the
     * configuration ({@code generate.filter} and {@code generate.normalise}, both
     * defaulting to {@code true}).
     *
     * <p>Delegates to the ten-argument overload with {@code force = false},
     * {@code maxNumSegments = 1} and no expression.
     *
     * @param dbDir    CrawlDb directory
     * @param segments directory under which segments are created
     * @param numLists number of fetch lists to produce
     * @param topN     maximum number of URLs to select
     * @param curTime  time used as "now" for the selection
     * @return whatever the delegate returns (generated segment paths or {@code null})
     */
    public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime) throws IOException, InterruptedException, ClassNotFoundException {
        // The flags are read through the configuration of a freshly created job instance.
        Configuration jobConf = Job.getInstance(getConf(), "Nutch Generator: generate from " + dbDir).getConfiguration();
        boolean applyFilters = jobConf.getBoolean(GENERATOR_FILTER, true);
        boolean applyNormalizers = jobConf.getBoolean(GENERATOR_NORMALISE, true);
        return generate(dbDir, segments, numLists, topN, curTime, applyFilters, applyNormalizers, false, 1, null);
    }

    /**
     * Deprecated overload without a normalise flag. Delegates to the ten-argument
     * overload with normalisation enabled, a single segment and no expression.
     */
    @Deprecated
    public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter, boolean force) throws IOException, InterruptedException, ClassNotFoundException {
        final boolean norm = true;
        final int maxNumSegments = 1;
        final String expr = null;
        return generate(dbDir, segments, numLists, topN, curTime, filter, norm, force, maxNumSegments, expr);
    }

    /**
     * Overload without a hostdb argument. Delegates to the eleven-argument overload,
     * passing {@code null} for {@code hostdb}.
     */
    public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter, boolean norm, boolean force, int maxNumSegments, String expr) throws IOException, InterruptedException, ClassNotFoundException {
        final String hostdb = null;
        return generate(dbDir, segments, numLists, topN, curTime, filter, norm, force, maxNumSegments, expr, hostdb);
    }

    /**
     * Selects URLs from the CrawlDb and writes them as fetch lists into newly created
     * segments.
     *
     * <p>Steps, in order:
     * <ol>
     * <li>lock the CrawlDb and run the selection job into a temporary directory;</li>
     * <li>run one partition job per {@code fetchlist-*} directory found in that temporary
     * directory, each producing one segment;</li>
     * <li>only when {@code generate.update.crawldb} is true: run an update job over the
     * generated {@code crawl_generate} directories plus the current CrawlDb and install
     * its output as the new CrawlDb.</li>
     * </ol>
     *
     * @param dbDir          CrawlDb directory; its {@code current} subdirectory is the job input
     * @param segments       directory under which the new segments are created
     * @param numLists       number of fetch lists per segment; {@code -1} reads the value of
     *                       {@code mapreduce.job.maps}; forced to 1 when
     *                       {@code mapreduce.framework.name} is {@code local}
     * @param topN           stored as {@code generate.topN}
     * @param curTime        stored as {@code generate.curTime}
     * @param filter         stored as {@code generate.filter}
     * @param norm           stored as {@code generate.normalise}
     * @param force          passed to {@code CrawlDb.lock}
     * @param maxNumSegments stored as {@code generate.max.num.segments}
     * @param expr           optional; stored as {@code generate.expr} when non-null
     * @param hostdb         optional; stored as {@code generate.hostdb} when non-null
     * @return the generated segment paths, or {@code null} when partitioning failed or no
     *         segment was generated
     * @throws IOException            rethrown from a failed job after cleanup
     * @throws InterruptedException   rethrown from a failed job after cleanup
     * @throws ClassNotFoundException rethrown from a failed job after cleanup
     * @throws RuntimeException       when the selection or update job completes unsuccessfully
     */
    public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter, boolean norm, boolean force, int maxNumSegments, String expr, String hostdb) throws IOException, InterruptedException, ClassNotFoundException {
        Path tempDir = new Path(this.getConf().get("mapreduce.cluster.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());
        FileSystem fs = tempDir.getFileSystem(this.getConf());
        Path lock = CrawlDb.lock(this.getConf(), dbDir, force);
        StopWatch stopWatch = new StopWatch();
        stopWatch.start();
        LOG.info("Generator: starting");
        LOG.info("Generator: selecting best-scoring urls due for fetch.");
        LOG.info("Generator: filtering: {}", filter);
        LOG.info("Generator: normalizing: {}", norm);
        if (topN != Long.MAX_VALUE) {
            LOG.info("Generator: topN: {}", topN);
        }
        if (expr != null) {
            LOG.info("Generator: expr: {}", (Object) expr);
        }
        if (hostdb != null) {
            LOG.info("Generator: hostdb: {}", (Object) hostdb);
        }

        // ---- Step 1: selection job, output goes to tempDir ----
        Job job = Job.getInstance(this.getConf(), "Nutch Generator: generate from " + dbDir);
        Configuration conf = job.getConfiguration();
        if (numLists == -1) {
            numLists = Integer.parseInt(conf.get("mapreduce.job.maps"));
        }
        if ("local".equals(conf.get("mapreduce.framework.name")) && numLists != 1) {
            LOG.info("Generator: running in local mode, generating exactly one partition.");
            numLists = 1;
        }
        conf.setLong(GENERATOR_CUR_TIME, curTime);
        // The same generate time is handed to the update job further down.
        long generateTime = System.currentTimeMillis();
        conf.setLong("_ngt_", generateTime);
        conf.setLong(GENERATOR_TOP_N, topN);
        conf.setBoolean(GENERATOR_FILTER, filter);
        conf.setBoolean(GENERATOR_NORMALISE, norm);
        conf.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);
        if (expr != null) {
            conf.set(GENERATOR_EXPR, expr);
        }
        if (hostdb != null) {
            conf.set(GENERATOR_HOSTDB, hostdb);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, "current"));
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setJarByClass(Selector.class);
        job.setMapperClass(SelectorMapper.class);
        job.setPartitionerClass(Selector.class);
        job.setReducerClass(SelectorReducer.class);
        FileOutputFormat.setOutputPath(job, tempDir);
        job.setOutputKeyClass(FloatWritable.class);
        job.setSortComparatorClass(DecreasingFloatComparator.class);
        job.setOutputValueClass(SelectorEntry.class);
        MultipleOutputs.addNamedOutput(job, "sequenceFiles", SequenceFileOutputFormat.class, FloatWritable.class, SelectorEntry.class);
        try {
            boolean success = job.waitForCompletion(true);
            if (!success) {
                String message = NutchJob.getJobFailureLogMessage("Generator", job);
                LOG.error(message);
                NutchJob.cleanupAfterFailure(tempDir, lock, fs);
                throw new RuntimeException(message);
            }
        }
        catch (IOException | ClassNotFoundException | InterruptedException e) {
            LOG.error("Generator job failed: {}", (Object) e.getMessage());
            NutchJob.cleanupAfterFailure(tempDir, lock, fs);
            throw e;
        }
        LOG.info("Generator: number of items rejected during selection:");
        for (Counter counter : job.getCounters().getGroup("Generator")) {
            LOG.info("Generator: {}  {}", String.format(Locale.ROOT, "%6d", counter.getValue()), counter.getName());
        }
        // Without the CrawlDb update step the lock is no longer needed; release it early.
        if (!this.getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
            LockUtil.removeLockFile(this.getConf(), lock);
            lock = null;
        }

        // ---- Step 2: one partition job (and one segment) per fetchlist-* directory ----
        ArrayList<Path> generatedSegments = new ArrayList<Path>();
        FileStatus[] status = fs.listStatus(tempDir);
        try {
            for (FileStatus stat : status) {
                Path subfetchlist = stat.getPath();
                if (!subfetchlist.getName().startsWith("fetchlist-")) {
                    continue;
                }
                Path newSeg = this.partitionSegment(segments, subfetchlist, numLists);
                generatedSegments.add(newSeg);
            }
        }
        catch (Exception e) {
            // Pass the exception to the logger so the cause of the failure is not lost.
            LOG.warn("Generator: exception while partitioning segments, exiting ...", e);
            NutchJob.cleanupAfterFailure(tempDir, lock, fs);
            return null;
        }
        if (generatedSegments.size() == 0) {
            LOG.warn("Generator: 0 records selected for fetching, exiting ...");
            NutchJob.cleanupAfterFailure(tempDir, lock, fs);
            return null;
        }

        // ---- Step 3 (optional): write the generate time back into the CrawlDb ----
        if (this.getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
            Path tempDir2 = new Path(dbDir, "generate-temp-" + UUID.randomUUID().toString());
            job = Job.getInstance(this.getConf(), "Nutch Generator: updatedb " + dbDir);
            job.getConfiguration().setLong("_ngt_", generateTime);
            for (Path segmpaths : generatedSegments) {
                Path subGenDir = new Path(segmpaths, "crawl_generate");
                FileInputFormat.addInputPath(job, subGenDir);
            }
            FileInputFormat.addInputPath(job, new Path(dbDir, "current"));
            job.setInputFormatClass(SequenceFileInputFormat.class);
            job.setMapperClass(CrawlDbUpdater.CrawlDbUpdateMapper.class);
            job.setReducerClass(CrawlDbUpdater.CrawlDbUpdateReducer.class);
            job.setJarByClass(CrawlDbUpdater.class);
            job.setOutputFormatClass(MapFileOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(CrawlDatum.class);
            FileOutputFormat.setOutputPath(job, tempDir2);
            try {
                boolean success = job.waitForCompletion(true);
                if (!success) {
                    String message = NutchJob.getJobFailureLogMessage("Generator", job);
                    LOG.error(message);
                    NutchJob.cleanupAfterFailure(tempDir, lock, fs);
                    NutchJob.cleanupAfterFailure(tempDir2, lock, fs);
                    throw new RuntimeException(message);
                }
                CrawlDb.install(job, dbDir);
            }
            catch (IOException | ClassNotFoundException | InterruptedException e) {
                LOG.error("Generator job failed: {}", (Object) e.getMessage());
                NutchJob.cleanupAfterFailure(tempDir, lock, fs);
                NutchJob.cleanupAfterFailure(tempDir2, lock, fs);
                throw e;
            }
            fs.delete(tempDir2, true);
        }
        if (lock != null) {
            LockUtil.removeLockFile(this.getConf(), lock);
        }
        fs.delete(tempDir, true);
        stopWatch.stop();
        LOG.info("Generator: finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS));
        Path[] patharray = new Path[generatedSegments.size()];
        return generatedSegments.toArray(patharray);
    }

    /**
     * Runs the partition job for one selected fetch list and returns the new segment.
     *
     * <p>A segment named by {@link #generateSegmentName()} is created under
     * {@code segmentsDir}; the job writes to its {@code crawl_generate} subdirectory.
     *
     * @param segmentsDir parent directory of the new segment
     * @param inputDir    selected fetch list to read
     * @param numLists    number of reduce tasks of the partition job
     * @return path of the newly created segment
     * @throws RuntimeException when the job completes unsuccessfully
     */
    private Path partitionSegment(Path segmentsDir, Path inputDir, int numLists) throws IOException, ClassNotFoundException, InterruptedException {
        LOG.info("Generator: Partitioning selected urls for politeness.");
        Path segment = new Path(segmentsDir, generateSegmentName());
        LOG.info("Generator: segment: {}", segment);

        Job partitionJob = Job.getInstance(getConf(), "Nutch Generator: partition segment " + segment);
        partitionJob.getConfiguration().setInt("partition.url.seed", RANDOM.nextInt());
        partitionJob.setJarByClass(Generator.class);

        // input and map side
        FileInputFormat.addInputPath(partitionJob, inputDir);
        partitionJob.setInputFormatClass(SequenceFileInputFormat.class);
        partitionJob.setMapperClass(SelectorInverseMapper.class);
        partitionJob.setMapOutputKeyClass(Text.class);
        partitionJob.setMapOutputValueClass(SelectorEntry.class);

        // shuffle
        partitionJob.setPartitionerClass(URLPartitioner.class);
        partitionJob.setSortComparatorClass(HashComparator.class);

        // reduce side and output
        partitionJob.setReducerClass(PartitionReducer.class);
        partitionJob.setNumReduceTasks(numLists);
        FileOutputFormat.setOutputPath(partitionJob, new Path(segment, "crawl_generate"));
        partitionJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        partitionJob.setOutputKeyClass(Text.class);
        partitionJob.setOutputValueClass(CrawlDatum.class);

        try {
            if (!partitionJob.waitForCompletion(true)) {
                String failure = NutchJob.getJobFailureLogMessage("Generator", partitionJob);
                LOG.error(failure);
                throw new RuntimeException(failure);
            }
        } catch (IOException | ClassNotFoundException | InterruptedException e) {
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        }
        return segment;
    }

    /**
     * Returns a segment name built from the current time, formatted as
     * {@code yyyyMMddHHmmss}.
     *
     * <p>The method sleeps for one second before taking the timestamp and is
     * synchronized, so two calls cannot normally produce the same second-resolution
     * name. If the sleep is interrupted, the name is still returned and the thread's
     * interrupt status is restored so callers can observe the interruption.
     *
     * @return the formatted timestamp
     */
    public static synchronized String generateSegmentName() {
        try {
            Thread.sleep(1000L);
        }
        catch (InterruptedException e) {
            // Best effort: do not fail name generation, but keep the interrupt visible.
            Thread.currentThread().interrupt();
        }
        return sdf.format(new Date(System.currentTimeMillis()));
    }

    /**
     * Command-line entry point: runs a new {@code Generator} through {@code ToolRunner}
     * with a Nutch configuration and exits the JVM with the returned status.
     */
    public static void main(String[] args) throws Exception {
        Configuration nutchConf = NutchConfiguration.create();
        int exitCode = ToolRunner.run(nutchConf, new Generator(), args);
        System.exit(exitCode);
    }

    /**
     * Parses the command line and runs the generator.
     *
     * <p>{@code args[0]} is the CrawlDb directory and {@code args[1]} the segments
     * directory. Options that take a value ({@code -topN}, {@code -numFetchers},
     * {@code -hostdb}, {@code -adddays}, {@code -maxNumSegments}, {@code -expr}) consume
     * the following argument, so a value is never re-interpreted as an option.
     * Unrecognised arguments are ignored.
     *
     * @param args command-line arguments
     * @return 0 on success, 1 when no segment was generated, -1 on a usage error
     *         (too few arguments, or an option without its value) or when generation
     *         threw an exception
     * @throws NumberFormatException when a numeric option value cannot be parsed
     */
    public int run(String[] args) throws Exception {
        if (args.length < 2) {
            System.out.println("Usage: Generator <crawldb> <segments_dir> [-hostdb <hostdb>] [-force] [-topN N] [-numFetchers numFetchers] [-expr <expr>] [-adddays <numDays>] [-noFilter] [-noNorm] [-maxNumSegments <num>]");
            return -1;
        }
        Path dbDir = new Path(args[0]);
        Path segmentsDir = new Path(args[1]);
        String hostdb = null;
        long curTime = System.currentTimeMillis();
        long topN = Long.MAX_VALUE;
        int numFetchers = -1;
        boolean filter = true;
        boolean norm = true;
        boolean force = false;
        String expr = null;
        int maxNumSegments = 1;
        for (int i = 2; i < args.length; ++i) {
            String option = args[i];

            // Flags without a value.
            if ("-noFilter".equals(option)) {
                filter = false;
                continue;
            }
            if ("-noNorm".equals(option)) {
                norm = false;
                continue;
            }
            if ("-force".equals(option)) {
                force = true;
                continue;
            }

            boolean takesValue = "-topN".equals(option)
                    || "-numFetchers".equals(option)
                    || "-hostdb".equals(option)
                    || "-adddays".equals(option)
                    || "-maxNumSegments".equals(option)
                    || "-expr".equals(option);
            if (!takesValue) {
                // Unknown arguments are ignored.
                continue;
            }
            if (i + 1 >= args.length) {
                System.err.println("Generator: missing value for option " + option);
                return -1;
            }
            // Consume the value so it is not parsed as an option on the next iteration.
            String value = args[++i];
            if ("-topN".equals(option)) {
                topN = Long.parseLong(value);
            } else if ("-numFetchers".equals(option)) {
                numFetchers = Integer.parseInt(value);
            } else if ("-hostdb".equals(option)) {
                hostdb = value;
            } else if ("-adddays".equals(option)) {
                long numDays = Integer.parseInt(value);
                curTime += numDays * 1000L * 60L * 60L * 24L;
            } else if ("-maxNumSegments".equals(option)) {
                maxNumSegments = Integer.parseInt(value);
            } else {
                expr = value;
            }
        }
        try {
            Path[] segs = this.generate(dbDir, segmentsDir, numFetchers, topN, curTime, filter, norm, force, maxNumSegments, expr, hostdb);
            if (segs == null) {
                return 1;
            }
        }
        catch (Exception e) {
            LOG.error("Generator:", (Throwable) e);
            return -1;
        }
        return 0;
    }

    /**
     * Map-based variant of {@link #run(String[])}.
     *
     * <p>Recognised keys: {@code crawldb} and {@code segment_dir} (a {@code Path} or any
     * object whose {@code toString()} is a path; defaults are {@code crawlId + "/crawldb"}
     * and {@code crawlId + "/segments"}), {@code hostdb}, {@code expr}, {@code topN},
     * {@code numFetchers}, {@code adddays}, {@code maxNumSegments} (all cast to
     * {@code String}), and the presence-only flags {@code noFilter}, {@code noNorm} and
     * {@code force}.
     *
     * @param args    argument map
     * @param crawlId prefix used for the default CrawlDb and segments paths
     * @return a map whose {@code "result"} entry is {@code "0"} on success, {@code "1"}
     *         when no segment was generated and {@code "-1"} when generation threw
     */
    @Override
    public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
        Map<String, Object> results = new HashMap<String, Object>();
        long curTime = System.currentTimeMillis();

        Path crawlDb;
        if (args.containsKey("crawldb")) {
            Object given = args.get("crawldb");
            crawlDb = given instanceof Path ? (Path) given : new Path(given.toString());
        } else {
            crawlDb = new Path(crawlId + "/crawldb");
        }

        Path segmentsDir;
        if (args.containsKey("segment_dir")) {
            Object given = args.get("segment_dir");
            segmentsDir = given instanceof Path ? (Path) given : new Path(given.toString());
        } else {
            segmentsDir = new Path(crawlId + "/segments");
        }

        String hostdb = args.containsKey("hostdb") ? (String) args.get("hostdb") : null;
        String expr = args.containsKey("expr") ? (String) args.get("expr") : null;

        long topN = Long.MAX_VALUE;
        if (args.containsKey("topN")) {
            topN = Long.parseLong((String) args.get("topN"));
        }
        int numFetchers = -1;
        if (args.containsKey("numFetchers")) {
            numFetchers = Integer.parseInt((String) args.get("numFetchers"));
        }
        if (args.containsKey("adddays")) {
            long numDays = Integer.parseInt((String) args.get("adddays"));
            curTime += numDays * 1000L * 60L * 60L * 24L;
        }

        // These three flags are switched by the mere presence of the key.
        boolean filter = !args.containsKey("noFilter");
        boolean norm = !args.containsKey("noNorm");
        boolean force = args.containsKey("force");

        int maxNumSegments = 1;
        if (args.containsKey("maxNumSegments")) {
            maxNumSegments = Integer.parseInt((String) args.get("maxNumSegments"));
        }

        int status;
        try {
            Path[] segs = generate(crawlDb, segmentsDir, numFetchers, topN, curTime, filter, norm, force, maxNumSegments, expr, hostdb);
            status = (segs == null) ? 1 : 0;
        } catch (Exception e) {
            LOG.error("Generator: {}", (Object) StringUtils.stringifyException(e));
            status = -1;
        }
        results.put("result", Integer.toString(status));
        return results;
    }

    /**
     * Mapper and reducer of the CrawlDb update job that {@code generate} runs when
     * {@code generate.update.crawldb} is enabled.
     */
    public static class CrawlDbUpdater {

        /** Identity mapper: every input record is emitted unchanged. */
        public static class CrawlDbUpdateMapper
        extends Mapper<Text, CrawlDatum, Text, CrawlDatum> {
            public void map(Text key, CrawlDatum value, Mapper.Context context) throws IOException, InterruptedException {
                context.write(key, value);
            }
        }

        /**
         * Writes one datum per key. A value whose generate-time metadata equals the
         * job's {@code _ngt_} value only contributes that time; every other value
         * becomes the datum to write. If a matching generate time was seen last, it is
         * stored in the written datum's metadata.
         */
        public static class CrawlDbUpdateReducer
        extends Reducer<Text, CrawlDatum, Text, CrawlDatum> {
            private CrawlDatum orig = new CrawlDatum();
            private LongWritable genTime = new LongWritable(0L);
            private long generateTime;

            public void setup(Reducer.Context context) {
                generateTime = context.getConfiguration().getLong("_ngt_", 0L);
            }

            public void reduce(Text key, Iterable<CrawlDatum> values, Reducer.Context context) throws IOException, InterruptedException {
                genTime.set(0L);
                for (CrawlDatum candidate : values) {
                    if (!candidate.getMetaData().containsKey(Nutch.WRITABLE_GENERATE_TIME_KEY)) {
                        // No generate time recorded: plain CrawlDb entry.
                        orig.set(candidate);
                        continue;
                    }
                    LongWritable recorded = (LongWritable) candidate.getMetaData().get(Nutch.WRITABLE_GENERATE_TIME_KEY);
                    if (recorded.get() == generateTime) {
                        // Stamped by this generate run: remember the time only.
                        genTime.set(recorded.get());
                    } else {
                        // Stamped with a different time: keep the datum, forget the time.
                        orig.set(candidate);
                        genTime.set(0L);
                    }
                }
                if (genTime.get() != 0L) {
                    orig.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, genTime);
                }
                context.write(key, orig);
            }
        }
    }

    /**
     * Orders {@code Text} keys by a 31-based hash of their bytes, computed from the last
     * byte towards the first. Keys with equal hashes compare as equal.
     */
    public static class HashComparator
    extends WritableComparator {
        public HashComparator() {
            super(Text.class);
        }

        /** Compares two deserialized {@code Text} keys by hash. */
        public int compare(WritableComparable a, WritableComparable b) {
            Text left = (Text) a;
            Text right = (Text) b;
            int leftHash = hash(left.getBytes(), 0, left.getLength());
            int rightHash = hash(right.getBytes(), 0, right.getLength());
            return Integer.compare(leftHash, rightHash);
        }

        /** Compares the two given byte ranges by hash. */
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            int leftHash = hash(b1, s1, l1);
            int rightHash = hash(b2, s2, l2);
            return Integer.compare(leftHash, rightHash);
        }

        /** Folds {@code length} bytes starting at {@code start}, last byte first. */
        private static int hash(byte[] bytes, int start, int length) {
            int result = 1;
            int index = start + length;
            while (index > start) {
                --index;
                result = 31 * result + bytes[index];
            }
            return result;
        }
    }

    /**
     * Reducer of the partition job: unwraps every {@code SelectorEntry} and emits its
     * {@code url} as key and its {@code datum} as value. The incoming key is not used.
     */
    public static class PartitionReducer
    extends Reducer<Text, SelectorEntry, Text, CrawlDatum> {
        public void reduce(Text key, Iterable<SelectorEntry> values, Reducer.Context context) throws IOException, InterruptedException {
            for (SelectorEntry selected : values) {
                Text url = selected.url;
                context.write(url, selected.datum);
            }
        }
    }

    /**
     * Mapper of the partition job: drops the incoming float key and re-keys each
     * {@code SelectorEntry} by its own {@code url}.
     */
    public static class SelectorInverseMapper
    extends Mapper<FloatWritable, SelectorEntry, Text, SelectorEntry> {
        public void map(FloatWritable key, SelectorEntry value, Mapper.Context context) throws IOException, InterruptedException {
            context.write(value.url, value);
        }
    }

    /**
     * Float comparator with reversed order: it calls the parent comparison with the two
     * operands swapped.
     */
    public static class DecreasingFloatComparator
    extends FloatWritable.Comparator {
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            // Second operand first, so the parent's ordering is inverted.
            int reversed = super.compare(b2, s2, l2, b1, s1, l1);
            return reversed;
        }
    }

    public static class SelectorReducer
    extends Reducer<FloatWritable, SelectorEntry, FloatWritable, SelectorEntry> {
        // Per host (or domain) counters used in reduce(): entries start as {1, 0}; element [0]
        // minus one indexes segCounts and element [1] is incremented in reduce().
        private HashMap<String, int[]> hostCounts = new HashMap();
        // Running counter compared against limit in reduce(); reset to 0 when
        // currentsegmentnum is advanced.
        private long count;
        private int currentsegmentnum = 1;
        // Created in setup(), closed in cleanup().
        private MultipleOutputs<FloatWritable, SelectorEntry> mos;
        private String outputFile;
        // generate.topN divided by the number of reduce tasks (see setup()).
        private long limit;
        // One slot per segment; sized to maxNumSegments in setup().
        private int[] segCounts;
        private int maxNumSegments = 1;
        // Value of generate.max.count (-1 when unset); reduce() only applies it when > 0.
        private int maxCount;
        private Configuration conf;
        // True when generate.count.mode is "domain".
        private boolean byDomain = false;
        // Only created when normalise is true; uses the "generate_host_count" scope.
        private URLNormalizers normalizers;
        // NOTE(review): static, yet assigned from the instance method setup() - confirm intended.
        private static boolean normalise;
        // Readers over <hostdb>/current; opened and closed again inside readHostDb().
        private SequenceFile.Reader[] hostdbReaders = null;
        // Parsed in setup() only when generate.hostdb is set.
        private JexlScript maxCountExpr = null;
        private JexlScript fetchDelayExpr = null;
        // Host name -> HostDatum, filled by readHostDb().
        private Map<String, HostDatum> hostDatumCache = new HashMap<String, HostDatum>();

        /**
         * Loads every record of the configured HostDb into {@code hostDatumCache}.
         *
         * <p>Does nothing when {@code generate.hostdb} is not set. Otherwise the readers
         * over {@code <hostdb>/current} are read to the end, a clone of each
         * {@code HostDatum} is stored under its key, and all readers are closed, including
         * when reading fails.
         *
         * @throws IOException wrapping any exception raised while reading, or the first
         *                     exception raised while closing a reader; further close
         *                     failures are attached as suppressed exceptions
         */
        public void readHostDb() throws IOException {
            if (this.conf.get(Generator.GENERATOR_HOSTDB) == null) {
                return;
            }
            Path path = new Path(this.conf.get(Generator.GENERATOR_HOSTDB), "current");
            this.hostdbReaders = SegmentReaderUtil.getReaders(path, this.conf);
            IOException failure = null;
            try {
                Text key = new Text();
                HostDatum value = new HostDatum();
                for (SequenceFile.Reader reader : this.hostdbReaders) {
                    while (reader.next(key, value)) {
                        // key/value are reused by the reader, so a copy is stored.
                        this.hostDatumCache.put(key.toString(), (HostDatum) value.clone());
                    }
                }
            }
            catch (Exception e) {
                failure = new IOException(e);
            }
            // Close every reader even after a read failure so no file handle leaks.
            for (SequenceFile.Reader reader : this.hostdbReaders) {
                try {
                    reader.close();
                }
                catch (IOException e) {
                    if (failure == null) {
                        failure = e;
                    } else {
                        failure.addSuppressed(e);
                    }
                }
            }
            if (failure != null) {
                throw failure;
            }
        }

        /**
         * Builds the JEXL context used to evaluate host-based expressions.
         *
         * <p>The context exposes the host counters ({@code dnsFailures},
         * {@code connectionFailures}, {@code unfetched}, {@code fetched},
         * {@code notModified}, {@code redirTemp}, {@code redirPerm}, {@code gone}), the
         * configuration as {@code conf}, and every metadata entry whose value is a
         * {@code FloatWritable}, {@code IntWritable} or {@code Text}. For {@code Text}
         * values, dashes in the key are replaced by underscores.
         *
         * @param datum host record to expose
         * @return the populated context
         */
        private JexlContext createContext(HostDatum datum) {
            MapContext jexl = new MapContext();
            jexl.set("dnsFailures", datum.getDnsFailures());
            jexl.set("connectionFailures", datum.getConnectionFailures());
            jexl.set("unfetched", datum.getUnfetched());
            jexl.set("fetched", datum.getFetched());
            jexl.set("notModified", datum.getNotModified());
            jexl.set("redirTemp", datum.getRedirTemp());
            jexl.set("redirPerm", datum.getRedirPerm());
            jexl.set("gone", datum.getGone());
            jexl.set("conf", this.conf);
            if (!datum.hasMetaData()) {
                return jexl;
            }
            for (Map.Entry entry : datum.getMetaData().entrySet()) {
                Object raw = entry.getValue();
                // The key is cast only for supported value types, as values of any
                // other type are skipped without touching the key.
                if (raw instanceof FloatWritable) {
                    String name = ((Text) entry.getKey()).toString();
                    jexl.set(name, Float.valueOf(((FloatWritable) raw).get()));
                } else if (raw instanceof IntWritable) {
                    String name = ((Text) entry.getKey()).toString();
                    jexl.set(name, Integer.valueOf(((IntWritable) raw).get()));
                } else if (raw instanceof Text) {
                    String name = ((Text) entry.getKey()).toString().replace("-", "_");
                    jexl.set(name, ((Text) raw).toString());
                }
            }
            return jexl;
        }

        /**
         * Initialises the reducer from the job configuration: output writer, per-reducer
         * URL limit ({@code generate.topN} divided by the number of reduce tasks), segment
         * counters, per-host/domain maximum, optional URL normalizers, optional HostDb
         * expressions, and finally the HostDb cache via {@code readHostDb()}.
         */
        public void setup(Reducer.Context context) throws IOException {
            this.conf = context.getConfiguration();
            this.mos = new MultipleOutputs(context);

            // The number of reduce tasks is read from a job instance built on this configuration.
            Job job = Job.getInstance(this.conf, "Nutch Generator.SelectorReducer");
            long topN = this.conf.getLong(Generator.GENERATOR_TOP_N, Long.MAX_VALUE);
            long reduceTasks = job.getNumReduceTasks();
            this.limit = topN / reduceTasks;

            this.maxNumSegments = this.conf.getInt(Generator.GENERATOR_MAX_NUM_SEGMENTS, 1);
            this.segCounts = new int[this.maxNumSegments];

            this.maxCount = this.conf.getInt(Generator.GENERATOR_MAX_COUNT, -1);
            if (this.maxCount == -1) {
                this.byDomain = false;
            }
            String countMode = this.conf.get(Generator.GENERATOR_COUNT_MODE);
            if (Generator.GENERATOR_COUNT_VALUE_DOMAIN.equals(countMode)) {
                this.byDomain = true;
            }

            normalise = this.conf.getBoolean(Generator.GENERATOR_NORMALISE, true);
            if (normalise) {
                this.normalizers = new URLNormalizers(this.conf, "generate_host_count");
            }

            boolean hostDbConfigured = this.conf.get(Generator.GENERATOR_HOSTDB) != null;
            if (hostDbConfigured) {
                this.maxCountExpr = JexlUtil.parseExpression(this.conf.get(Generator.GENERATOR_MAX_COUNT_EXPR, null));
                this.fetchDelayExpr = JexlUtil.parseExpression(this.conf.get(Generator.GENERATOR_FETCH_DELAY_EXPR, null));
            }
            this.readHostDb();
        }

        /** Closes the {@code MultipleOutputs} writer created in {@code setup}. */
        public void cleanup(Reducer.Context context) throws IOException, InterruptedException {
            mos.close();
        }

        /**
         * Distributes the entries of one sort-value group over the fetch-list segments.
         *
         * <p>For every entry the method looks up host-specific overrides (maximum URL
         * count and fetch delay) when the host changes, picks a segment number while
         * honouring the per-segment limit and the per-host/domain maximum, and writes
         * the entry to the named output {@code "sequenceFiles"} under a
         * {@code fetchlist-<segnum>/part} path.
         *
         * <p>Host-specific overrides are reset to the configured defaults whenever the
         * host changes, so a host without a cached {@code HostDatum} (or whose
         * expression fails) never inherits the values computed for the previous host.
         *
         * @param key sort value shared by all entries in {@code values}
         * @param values the selector entries to distribute
         * @param context reducer context, used here for the "Generator" counters
         * @throws IOException propagated from writing to the multiple outputs
         * @throws InterruptedException propagated from writing to the multiple outputs
         */
        public void reduce(FloatWritable key, Iterable<SelectorEntry> values, Reducer.Context context) throws IOException, InterruptedException {
            String currentHostname = null;
            HostDatum host = null;
            LongWritable variableFetchDelayWritable = null;
            Text variableFetchDelayKey = new Text("_variableFetchDelay_");
            // Effective per-host/domain cap: starts at the configured default and may be
            // replaced by the result of maxCountExpr for the current host.
            int maxCount = this.maxCount;
            for (SelectorEntry entry : values) {
                Text url = entry.url;
                String urlString = url.toString();
                URL u = null;
                String hostname = URLUtil.getHost(urlString);
                if (hostname == null) {
                    // No host could be extracted; forget the current host and its overrides
                    // so that nothing host-specific is applied to this URL.
                    currentHostname = hostname;
                    maxCount = this.maxCount;
                    variableFetchDelayWritable = null;
                } else if (!hostname.equals(currentHostname)) {
                    currentHostname = hostname;
                    // Drop the previous host's overrides before computing new ones.
                    maxCount = this.maxCount;
                    variableFetchDelayWritable = null;
                    host = this.hostDatumCache.get(hostname);
                    if (host != null) {
                        if (this.maxCountExpr != null) {
                            try {
                                // Accept any numeric result (Integer, Long, Double, ...), not only Double.
                                Object maxCountResult = this.maxCountExpr.execute(this.createContext(host));
                                long variableMaxCount = Math.round(((Number)maxCountResult).doubleValue());
                                LOG.debug("Generator: variable maxCount: {} for {}", (Object)variableMaxCount, (Object)hostname);
                                maxCount = (int)variableMaxCount;
                            }
                            catch (Exception e) {
                                // Best effort: keep the default maxCount when the expression fails.
                                LOG.error("Unable to execute variable maxCount expression because: {}", (Object)e.getMessage(), (Object)e);
                            }
                        }
                        if (this.fetchDelayExpr != null) {
                            try {
                                // Accept any numeric result (Integer, Long, Double, ...), not only Double.
                                Object fetchDelayResult = this.fetchDelayExpr.execute(this.createContext(host));
                                long variableFetchDelay = Math.round(((Number)fetchDelayResult).doubleValue());
                                LOG.debug("Generator: variable fetchDelay: {} ms for {}", (Object)variableFetchDelay, (Object)hostname);
                                variableFetchDelayWritable = new LongWritable(variableFetchDelay);
                            }
                            catch (Exception e) {
                                // Best effort: no variable fetch delay for this host when the expression fails.
                                LOG.error("Unable to execute fetch delay expression because: {}", (Object)e.getMessage(), (Object)e);
                            }
                        }
                    }
                }
                if (variableFetchDelayWritable != null) {
                    entry.datum.getMetaData().put((Writable)variableFetchDelayKey, variableFetchDelayWritable);
                }
                // The current segment is full: move on to the next one, or stop when
                // the maximum number of segments has been reached.
                if (this.count == this.limit) {
                    if (this.currentsegmentnum >= this.maxNumSegments) break;
                    this.count = 0L;
                    ++this.currentsegmentnum;
                }
                String hostordomain = null;
                try {
                    if (normalise && this.normalizers != null) {
                        urlString = this.normalizers.normalize(urlString, "generate_host_count");
                    }
                    u = new URL(urlString);
                    hostordomain = this.byDomain ? URLUtil.getDomainName(u) : u.getHost();
                }
                catch (MalformedURLException e) {
                    LOG.warn("Malformed URL: '{}', skipping ({})", (Object)urlString, (Object)StringUtils.stringifyException((Throwable)e));
                    context.getCounter("Generator", "MALFORMED_URL").increment(1L);
                    continue;
                }
                // Locale-independent lower-casing keeps the counting key stable
                // regardless of the JVM's default locale.
                hostordomain = hostordomain.toLowerCase(Locale.ROOT);
                if (maxCount > 0) {
                    // hostCount[0]: 1-based segment currently used for this host/domain,
                    // hostCount[1]: number of URLs counted for it in that segment.
                    int[] hostCount = this.hostCounts.get(hostordomain);
                    if (hostCount == null) {
                        hostCount = new int[]{1, 0};
                        this.hostCounts.put(hostordomain, hostCount);
                    }
                    hostCount[1] = hostCount[1] + 1;
                    // Skip segments that already hold 'limit' entries.
                    while ((long)this.segCounts[hostCount[0] - 1] >= this.limit && hostCount[0] < this.maxNumSegments) {
                        hostCount[0] = hostCount[0] + 1;
                        hostCount[1] = 0;
                    }
                    if (hostCount[1] > maxCount) {
                        if (hostCount[0] < this.maxNumSegments) {
                            // Per-host cap reached in this segment: continue in the next one.
                            hostCount[0] = hostCount[0] + 1;
                            hostCount[1] = 1;
                        } else {
                            // No segment left for this host/domain; report the overflow
                            // once per host and skip the URL.
                            if (hostCount[1] == maxCount + 1) {
                                context.getCounter("Generator", "HOSTS_AFFECTED_PER_HOST_OVERFLOW").increment(1L);
                                LOG.info("Host or domain {} has more than {} URLs for all {} segments. Additional URLs won't be included in the fetchlist.", new Object[]{hostordomain, maxCount, this.maxNumSegments});
                            }
                            context.getCounter("Generator", "URLS_SKIPPED_PER_HOST_OVERFLOW").increment(1L);
                            continue;
                        }
                    }
                    entry.segnum = new IntWritable(hostCount[0]);
                    int n = hostCount[0] - 1;
                    this.segCounts[n] = this.segCounts[n] + 1;
                } else {
                    // No per-host cap: everything goes to the current segment.
                    entry.segnum = new IntWritable(this.currentsegmentnum);
                    int n = this.currentsegmentnum - 1;
                    this.segCounts[n] = this.segCounts[n] + 1;
                }
                this.outputFile = this.generateFileName(entry);
                this.mos.write("sequenceFiles", (Object)key, (Object)entry, this.outputFile);
                ++this.count;
            }
        }

        /**
         * Builds the relative output path for an entry from its segment number.
         *
         * @param entry entry whose {@code segnum} selects the fetch list
         * @return {@code "fetchlist-" + segnum + "/part"}
         */
        private String generateFileName(SelectorEntry entry) {
            StringBuilder path = new StringBuilder("fetchlist-");
            path.append(entry.segnum.toString());
            path.append("/part");
            return path.toString();
        }
    }

    /**
     * Mapper of the selection step: decides for each (URL, CrawlDatum) record
     * whether it qualifies for generation and, if so, emits it wrapped in a
     * {@link SelectorEntry} keyed by its sort value.
     *
     * <p>Every rejection is recorded in a counter of the "Generator" group.
     */
    public static class SelectorMapper
    extends Mapper<Text, CrawlDatum, FloatWritable, SelectorEntry> {
        /** Generation time stamped into emitted records; may be replaced from the "_ngt_" setting. */
        private LongWritable genTime = new LongWritable(System.currentTimeMillis());
        /** Reference time used for the schedule and generate-delay checks. */
        private long curTime;
        private Configuration conf;
        private URLFilters filters;
        private ScoringFilters scfilters;
        /** Reused output value. */
        private SelectorEntry entry = new SelectorEntry();
        /** Reused output key. */
        private FloatWritable sortValue = new FloatWritable();
        /** Whether URL filters are applied. */
        private boolean filter;
        /** Minimum time between two generations of the same record, compared against {@code curTime}. */
        private long genDelay;
        private FetchSchedule schedule;
        /** Minimum sort value; NaN disables the check. */
        private float scoreThreshold = 0.0f;
        /** Maximum fetch interval; -1 disables the check. */
        private int intervalThreshold = -1;
        /** Only status allowed through; -1 disables the check. */
        private byte restrictStatus = (byte)-1;
        /** Optional expression each record must satisfy. */
        private JexlScript expr = null;

        /**
         * Reads all generator settings from the job configuration.
         *
         * @param context mapper context providing the configuration
         * @throws IOException declared by the overridden method
         */
        public void setup(Mapper.Context context) throws IOException {
            Configuration configuration = context.getConfiguration();
            this.conf = configuration;
            this.curTime = configuration.getLong(Generator.GENERATOR_CUR_TIME, System.currentTimeMillis());
            this.filters = new URLFilters(configuration);
            this.scfilters = new ScoringFilters(configuration);
            this.filter = configuration.getBoolean(Generator.GENERATOR_FILTER, true);
            // Default delay: 604800000 ms = 7 days.
            this.genDelay = configuration.getLong(Generator.GENERATOR_DELAY, 604800000L);

            // A positive "_ngt_" value replaces the construction-time generation time.
            long configuredGenTime = configuration.getLong("_ngt_", 0L);
            if (configuredGenTime > 0L) {
                this.genTime.set(configuredGenTime);
            }

            this.schedule = FetchScheduleFactory.getFetchSchedule(configuration);
            this.scoreThreshold = configuration.getFloat(Generator.GENERATOR_MIN_SCORE, Float.NaN);
            this.intervalThreshold = configuration.getInt(Generator.GENERATOR_MIN_INTERVAL, -1);

            String statusName = configuration.getTrimmed(Generator.GENERATOR_RESTRICT_STATUS, "");
            if (!statusName.isEmpty()) {
                this.restrictStatus = CrawlDatum.getStatusByName(statusName);
            }

            this.expr = JexlUtil.parseExpression(configuration.get(Generator.GENERATOR_EXPR, null));
        }

        /**
         * Runs one record through the selection checks, in this order: URL
         * filters, fetch schedule, generate delay, scoring sort value,
         * expression, status, score threshold, interval threshold. Records that
         * pass are stamped with the generation time and written out.
         *
         * @param key the URL
         * @param value the record stored for the URL
         * @param context mapper context used for counters and output
         * @throws IOException propagated from writing the output
         * @throws InterruptedException propagated from writing the output
         */
        public void map(Text key, CrawlDatum value, Mapper.Context context) throws IOException, InterruptedException {
            if (this.filter && this.rejectedByUrlFilters(key, context)) {
                return;
            }

            if (!this.schedule.shouldFetch(key, value, this.curTime)) {
                LOG.debug("-shouldFetch rejected '{}', fetchTime={}, curTime={}", new Object[]{key, value.getFetchTime(), this.curTime});
                SelectorMapper.bump(context, "SCHEDULE_REJECTED");
                return;
            }

            // Skip records generated recently that are still waiting for an update.
            LongWritable previousGenTime = (LongWritable)value.getMetaData().get((Object)Nutch.WRITABLE_GENERATE_TIME_KEY);
            boolean generatedRecently = previousGenTime != null && previousGenTime.get() + this.genDelay > this.curTime;
            if (generatedRecently) {
                SelectorMapper.bump(context, "WAIT_FOR_UPDATE");
                return;
            }

            // The sort value stays at 1.0 when the scoring filters fail.
            float sortScore = 1.0f;
            try {
                sortScore = this.scfilters.generatorSortValue(key, value, sortScore);
            }
            catch (ScoringFilterException sfe) {
                LOG.warn("Couldn't filter generatorSortValue for {}: {}", (Object)key, (Object)sfe);
            }

            String veto = this.vetoCounter(key, value, sortScore);
            if (veto != null) {
                SelectorMapper.bump(context, veto);
                return;
            }

            this.sortValue.set(sortScore);
            value.getMetaData().put((Writable)Nutch.WRITABLE_GENERATE_TIME_KEY, (Writable)this.genTime);
            this.entry.datum = value;
            this.entry.url = key;
            context.write((Object)this.sortValue, (Object)this.entry);
        }

        /**
         * Applies the URL filters to a URL.
         *
         * @return {@code true} if the filters rejected the URL; {@code false}
         *         if it passed or if filtering itself failed (the failure is
         *         counted and logged, and the record is processed further)
         */
        private boolean rejectedByUrlFilters(Text url, Mapper.Context context) {
            try {
                if (this.filters.filter(url.toString()) != null) {
                    return false;
                }
                SelectorMapper.bump(context, "URL_FILTERS_REJECTED");
                return true;
            }
            catch (URLFilterException e) {
                SelectorMapper.bump(context, "URL_FILTER_EXCEPTION");
                LOG.warn("Couldn't filter url: {} ({})", (Object)url, (Object)e.getMessage());
                return false;
            }
        }

        /**
         * Evaluates the expression, status, score and interval restrictions.
         *
         * @return the name of the counter for the first failing restriction,
         *         or {@code null} when the record passes all of them
         */
        private String vetoCounter(Text key, CrawlDatum datum, float sortScore) {
            if (this.expr != null && !datum.execute(this.expr, key.toString())) {
                return "EXPR_REJECTED";
            }
            if (this.restrictStatus != -1 && this.restrictStatus != datum.getStatus()) {
                return "STATUS_REJECTED";
            }
            if (!Float.isNaN(this.scoreThreshold) && sortScore < this.scoreThreshold) {
                return "SCORE_TOO_LOW";
            }
            if (this.intervalThreshold != -1 && datum.getFetchInterval() > this.intervalThreshold) {
                return "INTERVAL_REJECTED";
            }
            return null;
        }

        /** Increments the named counter of the "Generator" group by one. */
        private static void bump(Mapper.Context context, String counterName) {
            context.getCounter("Generator", counterName).increment(1L);
        }
    }

    /**
     * Partitioner for the selection step. It partitions by the URL carried in
     * the {@link SelectorEntry} value and forwards all work, including
     * configuration handling, to a {@code URLPartitioner}.
     */
    public static class Selector
    extends Partitioner<FloatWritable, Writable>
    implements Configurable {
        /** Partitioner that does the actual work. */
        private final URLPartitioner delegate = new URLPartitioner();

        /**
         * @param key sort value of the entry, passed to the delegate as its value argument
         * @param value must be a {@link SelectorEntry}; its URL is the partitioning key
         * @param numReduceTasks number of partitions
         * @return the partition chosen by the delegate
         */
        public int getPartition(FloatWritable key, Writable value, int numReduceTasks) {
            SelectorEntry selected = (SelectorEntry)value;
            Text url = selected.url;
            return this.delegate.getPartition(url, (Writable)key, numReduceTasks);
        }

        /** Returns the configuration held by the delegate. */
        public Configuration getConf() {
            Configuration delegateConf = this.delegate.getConf();
            return delegateConf;
        }

        /** Passes the configuration on to the delegate. */
        public void setConf(Configuration conf) {
            this.delegate.setConf(conf);
        }
    }

    /**
     * Value type of the selection step: a URL, its record and the number of the
     * segment it was assigned to. Serialized as url, datum, segnum, in that order.
     */
    public static class SelectorEntry
    implements Writable {
        /** The URL. */
        public Text url = new Text();
        /** The record belonging to the URL. */
        public CrawlDatum datum = new CrawlDatum();
        /** Segment number; 0 until one is assigned. */
        public IntWritable segnum = new IntWritable(0);

        /**
         * Restores the three fields, in the same order {@link #write} emits them.
         *
         * @param in source to read from
         * @throws IOException if reading any field fails
         */
        public void readFields(DataInput in) throws IOException {
            url.readFields(in);
            datum.readFields(in);
            segnum.readFields(in);
        }

        /**
         * Serializes url, datum and segnum, in that order.
         *
         * @param out sink to write to
         * @throws IOException if writing any field fails
         */
        public void write(DataOutput out) throws IOException {
            url.write(out);
            datum.write(out);
            segnum.write(out);
        }

        /** Returns {@code url=..., datum=..., segnum=...}. */
        public String toString() {
            StringBuilder text = new StringBuilder();
            text.append("url=").append(url.toString());
            text.append(", datum=").append(datum.toString());
            text.append(", segnum=").append(segnum.toString());
            return text.toString();
        }
    }
}

