/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.scoring.webgraph;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.scoring.webgraph.LinkDatum;
import org.apache.nutch.scoring.webgraph.Node;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class WebGraph
extends Configured
implements Tool {
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    public static final String LOCK_NAME = ".locked";
    public static final String INLINK_DIR = "inlinks";
    public static final String OUTLINK_DIR = "outlinks/current";
    public static final String OLD_OUTLINK_DIR = "outlinks/old";
    public static final String NODE_DIR = "nodes";

    /**
     * Creates or updates the web graph stored under {@code webGraphDb}.
     *
     * <p>A lock file ({@link #LOCK_NAME}) is created in the web graph directory,
     * then three MapReduce jobs are run in sequence: the outlink database is
     * rebuilt from the given segments merged with the existing outlinks, the
     * inlink database is derived from the new outlink database, and the node
     * database is derived from both. The lock file is removed once all three
     * jobs have been installed.
     *
     * <p>If a job reports failure or throws, {@code NutchJob.cleanupAfterFailure}
     * is invoked with that job's temporary output directory and the lock before
     * the error is propagated.
     *
     * @param webGraphDb the web graph directory; created if it does not exist
     * @param segments the segments to read outlinks from; may be {@code null},
     *        in which case only the existing outlink database is used as input
     * @param normalize whether the outlink job should normalize URLs
     * @param filter whether the outlink job should filter URLs
     * @throws IOException if a file system operation or a job fails with an I/O error
     * @throws InterruptedException if a job is interrupted
     * @throws ClassNotFoundException if a job class cannot be loaded
     * @throws RuntimeException if a job completes unsuccessfully
     */
    public void createWebGraph(Path webGraphDb, Path[] segments, boolean normalize, boolean filter) throws IOException, InterruptedException, ClassNotFoundException {
        StopWatch stopWatch = new StopWatch();
        stopWatch.start();
        LOG.info("WebGraphDb: starting");
        LOG.info("WebGraphDb: webgraphdb: {}", webGraphDb);
        LOG.info("WebGraphDb: URL normalize: {}", normalize);
        LOG.info("WebGraphDb: URL filter: {}", filter);

        FileSystem fs = webGraphDb.getFileSystem(this.getConf());
        Path lock = new Path(webGraphDb, LOCK_NAME);
        if (!fs.exists(webGraphDb)) {
            fs.mkdirs(webGraphDb);
        }
        LockUtil.createLockFile(fs, lock, false);

        // The three phases depend on each other's installed output, so order matters.
        Path outlinkDb = this.updateOutlinkDb(fs, webGraphDb, lock, segments, normalize, filter);
        Path inlinkDb = this.updateInlinkDb(fs, webGraphDb, lock, outlinkDb);
        this.updateNodeDb(fs, webGraphDb, lock, outlinkDb, inlinkDb);

        LockUtil.removeLockFile(fs, lock);
        stopWatch.stop();
        LOG.info("WebGraphDb: finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS));
    }

    /**
     * Runs the outlink job and installs its output as the current outlink database.
     *
     * @return the path of the (current) outlink database
     */
    private Path updateOutlinkDb(FileSystem fs, Path webGraphDb, Path lock, Path[] segments, boolean normalize, boolean filter) throws IOException, InterruptedException, ClassNotFoundException {
        Path outlinkDb = new Path(webGraphDb, OUTLINK_DIR);
        Path oldOutlinkDb = new Path(webGraphDb, OLD_OUTLINK_DIR);
        if (!fs.exists(outlinkDb)) {
            fs.mkdirs(outlinkDb);
        }
        // The job writes to a randomly suffixed sibling directory that is swapped in on success.
        Path tempOutlinkDb = new Path(outlinkDb + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
        Job outlinkJob = Job.getInstance(this.getConf(), "Nutch WebGraph: outlinkdb " + outlinkDb);
        Configuration outlinkJobConf = outlinkJob.getConfiguration();
        boolean deleteGone = outlinkJobConf.getBoolean("link.delete.gone", false);
        boolean preserveBackup = outlinkJobConf.getBoolean("db.preserve.backup", true);
        if (deleteGone) {
            LOG.info("OutlinkDb: deleting gone links");
        }

        if (segments != null) {
            for (Path segment : segments) {
                FileSystem sfs = segment.getFileSystem(outlinkJobConf);
                Path parseData = new Path(segment, "parse_data");
                if (sfs.exists(parseData)) {
                    LOG.info("OutlinkDb: adding input: {}", parseData);
                    FileInputFormat.addInputPath(outlinkJob, parseData);
                }
                // Fetch data is only needed when links of gone pages are to be deleted.
                if (deleteGone) {
                    Path crawlFetch = new Path(segment, "crawl_fetch");
                    if (sfs.exists(crawlFetch)) {
                        LOG.info("OutlinkDb: adding input: {}", crawlFetch);
                        FileInputFormat.addInputPath(outlinkJob, crawlFetch);
                    }
                }
            }
        }

        // The existing outlink database is always an input so old links are carried over.
        LOG.info("OutlinkDb: adding input: {}", outlinkDb);
        FileInputFormat.addInputPath(outlinkJob, outlinkDb);
        outlinkJobConf.setBoolean("webgraph.url.normalizers", normalize);
        outlinkJobConf.setBoolean("webgraph.url.filters", filter);
        outlinkJob.setInputFormatClass(SequenceFileInputFormat.class);
        outlinkJob.setJarByClass(OutlinkDb.class);
        outlinkJob.setMapperClass(OutlinkDb.OutlinkDbMapper.class);
        outlinkJob.setReducerClass(OutlinkDb.OutlinkDbReducer.class);
        outlinkJob.setMapOutputKeyClass(Text.class);
        outlinkJob.setMapOutputValueClass(NutchWritable.class);
        outlinkJob.setOutputKeyClass(Text.class);
        outlinkJob.setOutputValueClass(LinkDatum.class);
        FileOutputFormat.setOutputPath(outlinkJob, tempOutlinkDb);
        outlinkJob.setOutputFormatClass(MapFileOutputFormat.class);
        outlinkJobConf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

        try {
            LOG.info("OutlinkDb: running");
            boolean success = outlinkJob.waitForCompletion(true);
            if (!success) {
                String message = NutchJob.getJobFailureLogMessage("OutlinkDb", outlinkJob);
                LOG.error(message);
                NutchJob.cleanupAfterFailure(tempOutlinkDb, lock, fs);
                throw new RuntimeException(message);
            }
            LOG.info("OutlinkDb: installing {}", outlinkDb);
            // Presumably rotates current -> old, then temp -> current; see FSUtils.replace.
            FSUtils.replace(fs, oldOutlinkDb, outlinkDb, true);
            FSUtils.replace(fs, outlinkDb, tempOutlinkDb, true);
            if (!preserveBackup && fs.exists(oldOutlinkDb)) {
                fs.delete(oldOutlinkDb, true);
            }
            LOG.info("OutlinkDb: finished");
        } catch (IOException | ClassNotFoundException | InterruptedException e) {
            LOG.error("OutlinkDb failed:", e);
            NutchJob.cleanupAfterFailure(tempOutlinkDb, lock, fs);
            throw e;
        }
        return outlinkDb;
    }

    /**
     * Runs the inlink job over the outlink database and installs its output.
     *
     * @return the path of the inlink database
     */
    private Path updateInlinkDb(FileSystem fs, Path webGraphDb, Path lock, Path outlinkDb) throws IOException, InterruptedException, ClassNotFoundException {
        Path inlinkDb = new Path(webGraphDb, INLINK_DIR);
        Path tempInlinkDb = new Path(inlinkDb + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
        Job inlinkJob = Job.getInstance(this.getConf(), "Nutch WebGraph: inlinkdb " + inlinkDb);
        Configuration inlinkJobConf = inlinkJob.getConfiguration();
        LOG.info("InlinkDb: adding input: {}", outlinkDb);
        FileInputFormat.addInputPath(inlinkJob, outlinkDb);
        inlinkJob.setInputFormatClass(SequenceFileInputFormat.class);
        inlinkJob.setJarByClass(InlinkDb.class);
        // Only a mapper is configured; no reducer class is set for this job.
        inlinkJob.setMapperClass(InlinkDb.InlinkDbMapper.class);
        inlinkJob.setMapOutputKeyClass(Text.class);
        inlinkJob.setMapOutputValueClass(LinkDatum.class);
        inlinkJob.setOutputKeyClass(Text.class);
        inlinkJob.setOutputValueClass(LinkDatum.class);
        FileOutputFormat.setOutputPath(inlinkJob, tempInlinkDb);
        inlinkJob.setOutputFormatClass(MapFileOutputFormat.class);
        inlinkJobConf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

        try {
            LOG.info("InlinkDb: running");
            boolean success = inlinkJob.waitForCompletion(true);
            if (!success) {
                String message = NutchJob.getJobFailureLogMessage("InlinkDb", inlinkJob);
                LOG.error(message);
                NutchJob.cleanupAfterFailure(tempInlinkDb, lock, fs);
                throw new RuntimeException(message);
            }
            LOG.info("InlinkDb: installing {}", inlinkDb);
            FSUtils.replace(fs, inlinkDb, tempInlinkDb, true);
            LOG.info("InlinkDb: finished");
        } catch (IOException | ClassNotFoundException | InterruptedException e) {
            LOG.error("InlinkDb failed:", e);
            NutchJob.cleanupAfterFailure(tempInlinkDb, lock, fs);
            throw e;
        }
        return inlinkDb;
    }

    /**
     * Runs the node job over the outlink and inlink databases and installs its output.
     */
    private void updateNodeDb(FileSystem fs, Path webGraphDb, Path lock, Path outlinkDb, Path inlinkDb) throws IOException, InterruptedException, ClassNotFoundException {
        Path nodeDb = new Path(webGraphDb, NODE_DIR);
        Path tempNodeDb = new Path(nodeDb + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
        Job nodeJob = Job.getInstance(this.getConf(), "Nutch WebGraph: nodedb " + nodeDb);
        Configuration nodeJobConf = nodeJob.getConfiguration();
        LOG.info("NodeDb: adding input: {}", outlinkDb);
        LOG.info("NodeDb: adding input: {}", inlinkDb);
        FileInputFormat.addInputPath(nodeJob, outlinkDb);
        FileInputFormat.addInputPath(nodeJob, inlinkDb);
        nodeJob.setInputFormatClass(SequenceFileInputFormat.class);
        nodeJob.setJarByClass(NodeDb.class);
        // Only a reducer is configured; no mapper class is set for this job.
        nodeJob.setReducerClass(NodeDb.NodeDbReducer.class);
        nodeJob.setMapOutputKeyClass(Text.class);
        nodeJob.setMapOutputValueClass(LinkDatum.class);
        nodeJob.setOutputKeyClass(Text.class);
        nodeJob.setOutputValueClass(Node.class);
        FileOutputFormat.setOutputPath(nodeJob, tempNodeDb);
        nodeJob.setOutputFormatClass(MapFileOutputFormat.class);
        nodeJobConf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

        try {
            LOG.info("NodeDb: running");
            boolean success = nodeJob.waitForCompletion(true);
            if (!success) {
                String message = NutchJob.getJobFailureLogMessage("NodeDb", nodeJob);
                LOG.error(message);
                NutchJob.cleanupAfterFailure(tempNodeDb, lock, fs);
                throw new RuntimeException(message);
            }
            LOG.info("NodeDb: installing {}", nodeDb);
            FSUtils.replace(fs, nodeDb, tempNodeDb, true);
            LOG.info("NodeDb: finished");
        } catch (IOException | ClassNotFoundException | InterruptedException e) {
            LOG.error("NodeDb failed:", e);
            NutchJob.cleanupAfterFailure(tempNodeDb, lock, fs);
            throw e;
        }
    }

    /**
     * Command-line entry point: runs this tool through {@code ToolRunner} with a
     * configuration from {@code NutchConfiguration.create()} and exits the JVM
     * with the tool's return code.
     *
     * @param args the command-line arguments, passed through to {@link #run(String[])}
     * @throws Exception if {@code ToolRunner.run} throws
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        WebGraph tool = new WebGraph();
        int exitCode = ToolRunner.run(conf, tool, args);
        System.exit(exitCode);
    }

    /**
     * Parses the command line and runs {@link #createWebGraph}.
     *
     * <p>Recognized options: {@code -help}, {@code -normalize}, {@code -filter},
     * {@code -webgraphdb <dir>}, {@code -segment <seg>...} and
     * {@code -segmentDir <dir>...}. {@code -webgraphdb} and at least one of
     * {@code -segment} / {@code -segmentDir} are required. Segments given via
     * {@code -segment} and the sub-directories of every {@code -segmentDir}
     * value are combined; neither option overrides the other.
     *
     * @param args the command-line arguments
     * @return {@code 0} on success, {@code -1} if usage help was printed (help
     *         requested or required options missing), {@code -2} if parsing or
     *         graph creation threw an exception
     */
    @Override
    public int run(String[] args) throws Exception {
        Option helpOpt = new Option("h", "help", false, "show this help message");
        Option normOpt = new Option("n", "normalize", false, "whether to use URLNormalizers on the URL's in the segment");
        Option filtOpt = new Option("f", "filter", false, "whether to use URLFilters on the URL's in the segment");
        Option graphOpt = OptionBuilder.withArgName("webgraphdb")
            .hasArg()
            .withDescription("the web graph database to create (if none exists) or use if one does")
            .create("webgraphdb");
        Option segOpt = OptionBuilder.withArgName("segment")
            .hasArgs()
            .withDescription("the segment(s) to use")
            .create("segment");
        Option segDirOpt = OptionBuilder.withArgName("segmentDir")
            .hasArgs()
            .withDescription("the segment directory to use")
            .create("segmentDir");

        Options options = new Options();
        options.addOption(helpOpt);
        options.addOption(normOpt);
        options.addOption(filtOpt);
        options.addOption(graphOpt);
        options.addOption(segOpt);
        options.addOption(segDirOpt);

        GnuParser parser = new GnuParser();
        try {
            CommandLine line = parser.parse(options, args);
            boolean hasSegments = line.hasOption("segment") || line.hasOption("segmentDir");
            if (line.hasOption("help") || !line.hasOption("webgraphdb") || !hasSegments) {
                HelpFormatter formatter = new HelpFormatter();
                formatter.printHelp("WebGraph", options, true);
                return -1;
            }
            String webGraphDb = line.getOptionValue("webgraphdb");

            // Accumulate segments from both options so that giving -segmentDir
            // does not silently discard explicitly listed -segment paths.
            ArrayList<Path> segPaths = new ArrayList<>();
            if (line.hasOption("segment")) {
                for (String segment : line.getOptionValues("segment")) {
                    segPaths.add(new Path(segment));
                }
            }
            if (line.hasOption("segmentDir")) {
                // The option is declared with hasArgs(), so honor every value given.
                for (String segmentDir : line.getOptionValues("segmentDir")) {
                    Path dir = new Path(segmentDir);
                    FileSystem fs = dir.getFileSystem(this.getConf());
                    FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
                    Path[] found = HadoopFSUtil.getPaths(fstats);
                    // Guard against a null result; an empty directory just adds nothing.
                    if (found != null) {
                        for (Path segment : found) {
                            segPaths.add(segment);
                        }
                    }
                }
            }

            boolean normalize = line.hasOption("normalize");
            boolean filter = line.hasOption("filter");
            this.createWebGraph(new Path(webGraphDb), segPaths.toArray(new Path[0]), normalize, filter);
            return 0;
        } catch (Exception e) {
            // Tool boundary: report the failure through the log and the exit code.
            LOG.error("WebGraph:", e);
            return -2;
        }
    }

    private static class NodeDb
    extends Configured {
        private NodeDb() {
        }

        public static class NodeDbReducer
        extends Reducer<Text, LinkDatum, Text, Node> {
            public void reduce(Text key, Iterable<LinkDatum> values, Reducer.Context context) throws IOException, InterruptedException {
                Node node = new Node();
                int numInlinks = 0;
                int numOutlinks = 0;
                for (LinkDatum next : values) {
                    if (next.getLinkType() == 1) {
                        ++numInlinks;
                        continue;
                    }
                    if (next.getLinkType() != 2) continue;
                    ++numOutlinks;
                }
                node.setNumInlinks(numInlinks);
                node.setNumOutlinks(numOutlinks);
                node.setInlinkScore(0.0f);
                context.write((Object)key, (Object)node);
            }
        }
    }

    private static class InlinkDb
    extends Configured {
        private static long timestamp;

        private InlinkDb() {
        }

        public static class InlinkDbMapper
        extends Mapper<Text, LinkDatum, Text, LinkDatum> {
            public void setup(Mapper.Context context) {
                timestamp = System.currentTimeMillis();
            }

            public void map(Text key, LinkDatum datum, Mapper.Context context) throws IOException, InterruptedException {
                String fromUrl = key.toString();
                String toUrl = datum.getUrl();
                String anchor = datum.getAnchor();
                LinkDatum inlink = new LinkDatum(fromUrl, anchor, timestamp);
                inlink.setLinkType((byte)1);
                context.write((Object)new Text(toUrl), (Object)inlink);
            }
        }
    }

    public static class OutlinkDb
    extends Configured {
        public static final String URL_NORMALIZING = "webgraph.url.normalizers";
        public static final String URL_FILTERING = "webgraph.url.filters";

        /**
         * Returns the fetch time recorded in the content metadata of {@code data}
         * under the key {@code "_ftk_"}, or the current time if that value is
         * missing or cannot be parsed as a {@code long}.
         *
         * @param data the parse data whose content metadata is consulted
         * @return the parsed fetch time, or {@code System.currentTimeMillis()} as fallback
         */
        private static long getFetchTime(ParseData data) {
            String rawFetchTime = data.getContentMeta().get("_ftk_");
            try {
                return Long.parseLong(rawFetchTime);
            }
            catch (Exception unparsable) {
                // Absent or malformed value: fall back to "now".
                return System.currentTimeMillis();
            }
        }

        /**
         * No-argument constructor; does not itself set a configuration.
         */
        public OutlinkDb() {
        }

        /**
         * Creates an instance and passes {@code conf} to {@code setConf}.
         *
         * @param conf the configuration to set on this instance
         */
        public OutlinkDb(Configuration conf) {
            this.setConf(conf);
        }

        public static class OutlinkDbReducer
        extends Reducer<Text, NutchWritable, Text, LinkDatum> {
            private boolean ignoreDomain = true;
            private boolean ignoreHost = true;
            private boolean limitPages = true;
            private boolean limitDomains = true;
            private Configuration conf;
            private Counter addedLinksCounter;
            private Counter removedLinksCounter;

            public void setup(Reducer.Context context) {
                Configuration config;
                this.conf = config = context.getConfiguration();
                this.ignoreHost = this.conf.getBoolean("link.ignore.internal.host", true);
                this.ignoreDomain = this.conf.getBoolean("link.ignore.internal.domain", true);
                this.limitPages = this.conf.getBoolean("link.ignore.limit.page", true);
                this.limitDomains = this.conf.getBoolean("link.ignore.limit.domain", true);
                this.initCounters(context);
            }

            private void initCounters(Reducer.Context context) {
                this.addedLinksCounter = context.getCounter("nutch_webgraph", "added_links_total");
                this.removedLinksCounter = context.getCounter("nutch_webgraph", "removed_links_total");
            }

            public void reduce(Text key, Iterable<NutchWritable> values, Reducer.Context context) throws IOException, InterruptedException {
                long mostRecent = 0L;
                ArrayList<LinkDatum> outlinkList = new ArrayList<LinkDatum>();
                for (NutchWritable val : values) {
                    BooleanWritable delete;
                    Writable value = val.get();
                    if (value instanceof LinkDatum) {
                        LinkDatum next = (LinkDatum)value;
                        long timestamp = next.getTimestamp();
                        if (mostRecent == 0L || mostRecent < timestamp) {
                            mostRecent = timestamp;
                        }
                        outlinkList.add((LinkDatum)WritableUtils.clone((Writable)next, (Configuration)this.conf));
                        this.addedLinksCounter.increment(1L);
                        continue;
                    }
                    if (!(value instanceof BooleanWritable) || !(delete = (BooleanWritable)value).get()) continue;
                    this.removedLinksCounter.increment(1L);
                    return;
                }
                String url = key.toString();
                String domain = URLUtil.getDomainName(url);
                String host = URLUtil.getHost(url);
                HashSet<String> domains = new HashSet<String>();
                HashSet<String> pages = new HashSet<String>();
                for (LinkDatum datum : outlinkList) {
                    String toUrl = datum.getUrl();
                    String toDomain = URLUtil.getDomainName(toUrl);
                    String toHost = URLUtil.getHost(toUrl);
                    String toPage = URLUtil.getPage(toUrl);
                    datum.setLinkType((byte)2);
                    if (datum.getTimestamp() != mostRecent || this.limitPages && (!this.limitPages || pages.contains(toPage)) || this.limitDomains && (!this.limitDomains || domains.contains(toDomain)) || this.ignoreHost && (!this.ignoreHost || toHost.equalsIgnoreCase(host)) || this.ignoreDomain && (!this.ignoreDomain || toDomain.equalsIgnoreCase(domain))) continue;
                    context.write((Object)key, (Object)datum);
                    pages.add(toPage);
                    domains.add(toDomain);
                }
            }
        }

        public static class OutlinkDbMapper
        extends Mapper<Text, Writable, Text, NutchWritable> {
            private boolean normalize = false;
            private boolean filter = false;
            private URLNormalizers urlNormalizers;
            private URLFilters filters;
            private Configuration conf;

            private String normalizeUrl(String url) {
                if (!this.normalize) {
                    return url;
                }
                String normalized = null;
                if (this.urlNormalizers != null) {
                    try {
                        normalized = this.urlNormalizers.normalize(url, "default");
                        normalized = normalized.trim();
                    }
                    catch (Exception e) {
                        LOG.warn("Skipping {}:{}", (Object)url, (Object)e);
                        normalized = null;
                    }
                }
                return normalized;
            }

            private String filterUrl(String url) {
                if (!this.filter) {
                    return url;
                }
                try {
                    url = this.filters.filter(url);
                }
                catch (Exception e) {
                    url = null;
                }
                return url;
            }

            public void setup(Mapper.Context context) {
                Configuration config;
                this.conf = config = context.getConfiguration();
                this.normalize = this.conf.getBoolean(OutlinkDb.URL_NORMALIZING, false);
                this.filter = this.conf.getBoolean(OutlinkDb.URL_FILTERING, false);
                if (this.normalize) {
                    this.urlNormalizers = new URLNormalizers(this.conf, "default");
                }
                if (this.filter) {
                    this.filters = new URLFilters(this.conf);
                }
            }

            public void map(Text key, Writable value, Mapper.Context context) throws IOException, InterruptedException {
                LinkDatum datum;
                String linkDatumUrl;
                String url = this.normalizeUrl(key.toString());
                if (url == null) {
                    return;
                }
                if (this.filterUrl(url) == null) {
                    return;
                }
                key.set(url);
                if (value instanceof CrawlDatum) {
                    CrawlDatum datum2 = (CrawlDatum)value;
                    if (datum2.getStatus() == 35 || datum2.getStatus() == 36 || datum2.getStatus() == 37) {
                        context.write((Object)key, (Object)new NutchWritable((Writable)new BooleanWritable(true)));
                    }
                } else if (value instanceof ParseData) {
                    ParseData data = (ParseData)value;
                    long fetchTime = OutlinkDb.getFetchTime(data);
                    Outlink[] outlinkAr = data.getOutlinks();
                    LinkedHashMap<String, String> outlinkMap = new LinkedHashMap<String, String>();
                    if (outlinkAr != null && outlinkAr.length > 0) {
                        for (int i = 0; i < outlinkAr.length; ++i) {
                            Outlink outlink = outlinkAr[i];
                            String toUrl = this.normalizeUrl(outlink.getToUrl());
                            if (this.filterUrl(toUrl) == null) continue;
                            boolean existingUrl = outlinkMap.containsKey(toUrl);
                            if (toUrl == null || existingUrl && (!existingUrl || outlinkMap.get(toUrl) != null)) continue;
                            outlinkMap.put(toUrl, outlink.getAnchor());
                        }
                    }
                    for (String outlinkUrl : outlinkMap.keySet()) {
                        String anchor = (String)outlinkMap.get(outlinkUrl);
                        LinkDatum datum3 = new LinkDatum(outlinkUrl, anchor, fetchTime);
                        context.write((Object)key, (Object)new NutchWritable(datum3));
                    }
                } else if (value instanceof LinkDatum && this.filterUrl(linkDatumUrl = this.normalizeUrl((datum = (LinkDatum)value).getUrl())) != null) {
                    datum.setUrl(linkDatumUrl);
                    context.write((Object)key, (Object)new NutchWritable(datum));
                }
            }
        }
    }
}

