/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.parse;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Progressable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLExemptionFilters;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ParseOutputFormat
extends OutputFormat<Text, Parse> {
    // Class-wide logger, resolved from the enclosing class via MethodHandles.
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    // URL filters for outlinks; assigned in getRecordWriter only when "parse.filter.urls" is true, otherwise left null.
    private URLFilters filters;
    // Exemption filters consulted for external links; assigned together with `filters`, otherwise left null.
    private URLExemptionFilters exemptionFilters;
    // URL normalizers; assigned in getRecordWriter only when "parse.normalize.urls" is true, otherwise left null.
    private URLNormalizers normalizers;
    // Scoring filters; always assigned in getRecordWriter.
    private ScoringFilters scfilters;
    // Shared formatter for the partition number in output file names; configured in the static initializer.
    // NOTE(review): NumberFormat instances are not thread-safe and this one is shared statically — confirm
    // getUniqueFile is never called concurrently within one JVM.
    private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();

    /**
     * Returns a {@link FileOutputCommitter} bound to the job's configured output path.
     *
     * @param context the task attempt whose output is to be committed
     * @return a new file output committer for the job output directory
     * @throws IOException if the committer cannot be created
     */
    public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException {
        return new FileOutputCommitter(FileOutputFormat.getOutputPath(context), context);
    }

    /**
     * Verifies that the job output is usable: an output directory must be
     * configured and it must not already contain a "crawl_parse" sub-directory.
     *
     * @param context the job whose output specification is checked
     * @throws IOException if no output directory is configured, if the
     *         directory already contains "crawl_parse", or if the file system
     *         cannot be accessed
     */
    public void checkOutputSpecs(JobContext context) throws IOException {
        Path out = FileOutputFormat.getOutputPath(context);
        if (out == null) {
            // Fail with a descriptive error instead of a NullPointerException below.
            throw new IOException("Output directory not set in JobContext.");
        }
        FileSystem fs = out.getFileSystem(context.getConfiguration());
        if (fs.exists(new Path(out, "crawl_parse"))) {
            throw new IOException("Segment already parsed!");
        }
    }

    /**
     * Builds the per-task output file name in the form
     * {@code <name>-<task type character>-<formatted partition number>}.
     *
     * @param context the task attempt supplying the task type and partition id
     * @param name the file name prefix
     * @return the composed file name
     */
    public String getUniqueFile(TaskAttemptContext context, String name) {
        TaskID task = context.getTaskAttemptID().getTaskID();
        char typeChar = TaskID.getRepresentingCharacter(task.getTaskType());
        String formattedPartition = NUMBER_FORMAT.format(task.getId());
        return name + '-' + typeChar + '-' + formattedPartition;
    }

    public RecordWriter<Text, Parse> getRecordWriter(TaskAttemptContext context) throws IOException {
        MapFile.Writer textOut;
        Configuration conf = context.getConfiguration();
        String name = this.getUniqueFile(context, "part");
        Path dir = FileOutputFormat.getOutputPath((JobContext)context);
        FileSystem fs = dir.getFileSystem(context.getConfiguration());
        if (conf.getBoolean("parse.filter.urls", true)) {
            this.filters = new URLFilters(conf);
            this.exemptionFilters = new URLExemptionFilters(conf);
        }
        if (conf.getBoolean("parse.normalize.urls", true)) {
            this.normalizers = new URLNormalizers(conf, "outlink");
        }
        this.scfilters = new ScoringFilters(conf);
        final int interval = conf.getInt("db.fetch.interval.default", 2592000);
        final boolean ignoreInternalLinks = conf.getBoolean("db.ignore.internal.links", false);
        final boolean ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
        final String ignoreExternalLinksMode = conf.get("db.ignore.external.links.mode", "byHost");
        boolean storeText = conf.getBoolean("parser.store.text", true);
        int maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
        final int maxOutlinks = maxOutlinksPerPage < 0 ? Integer.MAX_VALUE : maxOutlinksPerPage;
        int maxOutlinkL = conf.getInt("db.max.outlink.length", 4096);
        final int maxOutlinkLength = maxOutlinkL < 0 ? Integer.MAX_VALUE : maxOutlinkL;
        final boolean isParsing = conf.getBoolean("fetcher.parse", true);
        SequenceFile.CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType((JobContext)context);
        Path out = FileOutputFormat.getOutputPath((JobContext)context);
        Path text = new Path(new Path(out, "parse_text"), name);
        Path data = new Path(new Path(out, "parse_data"), name);
        Path crawl = new Path(new Path(out, "crawl_parse"), name);
        final String[] parseMDtoCrawlDB = conf.get("db.parsemeta.to.crawldb", "").split(" *, *");
        if (storeText) {
            MapFile.Writer.Option tKeyClassOpt = MapFile.Writer.keyClass(Text.class);
            SequenceFile.Writer.Option tValClassOpt = SequenceFile.Writer.valueClass(ParseText.class);
            SequenceFile.Writer.Option tProgressOpt = SequenceFile.Writer.progressable((Progressable)context);
            SequenceFile.Writer.Option tCompOpt = SequenceFile.Writer.compression((SequenceFile.CompressionType)SequenceFile.CompressionType.RECORD);
            textOut = new MapFile.Writer(conf, text, new SequenceFile.Writer.Option[]{tKeyClassOpt, tValClassOpt, tCompOpt, tProgressOpt});
        } else {
            textOut = null;
        }
        MapFile.Writer.Option dKeyClassOpt = MapFile.Writer.keyClass(Text.class);
        SequenceFile.Writer.Option dValClassOpt = SequenceFile.Writer.valueClass(ParseData.class);
        SequenceFile.Writer.Option dProgressOpt = SequenceFile.Writer.progressable((Progressable)context);
        SequenceFile.Writer.Option dCompOpt = SequenceFile.Writer.compression((SequenceFile.CompressionType)compType);
        final MapFile.Writer dataOut = new MapFile.Writer(conf, data, new SequenceFile.Writer.Option[]{dKeyClassOpt, dValClassOpt, dCompOpt, dProgressOpt});
        final SequenceFile.Writer crawlOut = SequenceFile.createWriter((Configuration)conf, (SequenceFile.Writer.Option[])new SequenceFile.Writer.Option[]{SequenceFile.Writer.file((Path)crawl), SequenceFile.Writer.keyClass(Text.class), SequenceFile.Writer.valueClass(CrawlDatum.class), SequenceFile.Writer.bufferSize((int)fs.getConf().getInt("io.file.buffer.size", 4096)), SequenceFile.Writer.replication((short)fs.getDefaultReplication(crawl)), SequenceFile.Writer.blockSize((long)0x40000000L), SequenceFile.Writer.compression((SequenceFile.CompressionType)compType, (CompressionCodec)new DefaultCodec()), SequenceFile.Writer.progressable((Progressable)context), SequenceFile.Writer.metadata((SequenceFile.Metadata)new SequenceFile.Metadata())});
        return new RecordWriter<Text, Parse>(){

            /*
             * WARNING - void declaration
             */
            public void write(Text key, Parse parse) throws IOException {
                ParseStatus pstatus;
                byte[] signature;
                ParseData parseData;
                String sig;
                String fromUrl = key.toString();
                String origin = null;
                if (textOut != null) {
                    textOut.append((WritableComparable)key, (Writable)new ParseText(parse.getText()));
                }
                if ((sig = (parseData = parse.getData()).getContentMeta().get("nutch.content.digest")) != null && (signature = StringUtil.fromHexString(sig)) != null) {
                    CrawlDatum d = new CrawlDatum(65, 0);
                    d.setSignature(signature);
                    crawlOut.append((Writable)key, (Writable)d);
                }
                CrawlDatum parseMDCrawlDatum = null;
                for (String mdname : parseMDtoCrawlDB) {
                    String mdvalue = parse.getData().getParseMeta().get(mdname);
                    if (mdvalue == null) continue;
                    if (parseMDCrawlDatum == null) {
                        parseMDCrawlDatum = new CrawlDatum(68, 0);
                    }
                    parseMDCrawlDatum.getMetaData().put((Writable)new Text(mdname), (Writable)new Text(mdvalue));
                }
                if (parseMDCrawlDatum != null) {
                    crawlOut.append((Writable)key, parseMDCrawlDatum);
                }
                if (ignoreExternalLinks || ignoreInternalLinks) {
                    URL originURL = new URL(fromUrl.toString());
                    origin = "bydomain".equalsIgnoreCase(ignoreExternalLinksMode) ? URLUtil.getDomainName(originURL).toLowerCase() : originURL.getHost().toLowerCase();
                }
                if ((pstatus = parseData.getStatus()) != null && pstatus.isSuccess() && pstatus.getMinorCode() == 100) {
                    String newUrl = pstatus.getMessage();
                    int refreshTime = Integer.parseInt(pstatus.getArgs()[1]);
                    if ((newUrl = ParseOutputFormat.filterNormalize(fromUrl, newUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, ParseOutputFormat.this.filters, ParseOutputFormat.this.exemptionFilters, ParseOutputFormat.this.normalizers, "fetcher")) != null) {
                        String reprUrl = URLUtil.chooseRepr(fromUrl, newUrl, refreshTime < 5);
                        CrawlDatum newDatum = new CrawlDatum();
                        newDatum.setStatus(67);
                        if (reprUrl != null && !reprUrl.equals(newUrl)) {
                            newDatum.getMetaData().put((Writable)Nutch.WRITABLE_REPR_URL_KEY, (Writable)new Text(reprUrl));
                        }
                        crawlOut.append((Writable)new Text(newUrl), (Writable)newDatum);
                    }
                }
                Outlink[] links = parseData.getOutlinks();
                int outlinksToStore = Math.min(maxOutlinks, links.length);
                int validCount = 0;
                CrawlDatum adjust = null;
                ArrayList<Map.Entry<Text, CrawlDatum>> targets = new ArrayList<Map.Entry<Text, CrawlDatum>>(outlinksToStore);
                ArrayList<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
                for (int i = 0; i < links.length && validCount < outlinksToStore; ++i) {
                    void var16_21;
                    String string;
                    String string2 = links[i].getToUrl();
                    if (!isParsing && (string2.length() > maxOutlinkLength || (string = ParseOutputFormat.filterNormalize(fromUrl, string2, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, ParseOutputFormat.this.filters, ParseOutputFormat.this.exemptionFilters, ParseOutputFormat.this.normalizers)) == null)) continue;
                    CrawlDatum target = new CrawlDatum(67, interval);
                    Text targetUrl = new Text((String)var16_21);
                    MapWritable outlinkMD = links[i].getMetadata();
                    if (outlinkMD != null) {
                        target.getMetaData().putAll((Map)outlinkMD);
                    }
                    try {
                        ParseOutputFormat.this.scfilters.initialScore(targetUrl, target);
                    }
                    catch (ScoringFilterException e) {
                        LOG.warn("Cannot filter init score for url {}, using default: {}", (Object)key, (Object)e.getMessage());
                        target.setScore(0.0f);
                    }
                    targets.add(new SimpleEntry(targetUrl, target));
                    links[i].setUrl((String)var16_21);
                    outlinkList.add(links[i]);
                    ++validCount;
                }
                try {
                    adjust = ParseOutputFormat.this.scfilters.distributeScoreToOutlinks(key, parseData, targets, null, links.length);
                }
                catch (ScoringFilterException e) {
                    LOG.warn("Cannot distribute score from {}: {}", (Object)key, (Object)e.getMessage());
                }
                for (Map.Entry entry : targets) {
                    crawlOut.append((Writable)entry.getKey(), (Writable)entry.getValue());
                }
                if (adjust != null) {
                    crawlOut.append((Writable)key, (Writable)adjust);
                }
                Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList.size()]);
                parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), filteredLinks, parseData.getContentMeta(), parseData.getParseMeta());
                dataOut.append((WritableComparable)key, (Writable)parseData);
                if (!parse.isCanonical()) {
                    CrawlDatum crawlDatum = new CrawlDatum();
                    crawlDatum.setStatus(33);
                    String timeString = parse.getData().getContentMeta().get("_ftk_");
                    try {
                        crawlDatum.setFetchTime(Long.parseLong(timeString));
                    }
                    catch (Exception e) {
                        LOG.warn("Can't read fetch time for: {}", (Object)key);
                        crawlDatum.setFetchTime(System.currentTimeMillis());
                    }
                    crawlOut.append((Writable)key, (Writable)crawlDatum);
                }
            }

            public void close(TaskAttemptContext context) throws IOException {
                if (textOut != null) {
                    textOut.close();
                }
                dataOut.close();
                crawlOut.close();
            }
        };
    }

    /**
     * Convenience overload of the ten-argument {@code filterNormalize} that
     * uses the "outlink" normalizer scope.
     *
     * @return the normalized and filtered target URL, or {@code null} if the link is rejected
     */
    public static String filterNormalize(String fromUrl, String toUrl, String fromHost, boolean ignoreInternalLinks, boolean ignoreExternalLinks, String ignoreExternalLinksMode, URLFilters filters, URLExemptionFilters exemptionFilters, URLNormalizers normalizers) {
        final String scope = "outlink";
        return filterNormalize(fromUrl, toUrl, fromHost, ignoreInternalLinks, ignoreExternalLinks,
                ignoreExternalLinksMode, filters, exemptionFilters, normalizers, scope);
    }

    /**
     * Decides whether a link from {@code fromUrl} to {@code toUrl} is kept and,
     * if so, returns the target URL after normalization and filtering.
     *
     * <p>A link is rejected ({@code null} is returned) when
     * <ul>
     * <li>source and target URL are equal;</li>
     * <li>internal or external links are ignored and the target is not a valid URL;</li>
     * <li>external links are ignored and the target's host (or domain, in
     * "bydomain" mode) differs from {@code origin} — in host mode an
     * exemption filter may still let the link through;</li>
     * <li>internal links are ignored and the target's host/domain equals
     * {@code origin} or cannot be determined;</li>
     * <li>a normalizer or filter rejects the URL or throws.</li>
     * </ul>
     *
     * @param fromUrl URL of the page containing the link; must not be null
     * @param toUrl link target
     * @param origin lower-cased host or domain of the source page, compared against the target's
     * @param ignoreInternalLinks whether links within the same host/domain are dropped
     * @param ignoreExternalLinks whether links to other hosts/domains are dropped
     * @param ignoreExternalLinksMode "bydomain" (case-insensitive) compares domains, anything else compares hosts
     * @param filters URL filters to apply, or null for none
     * @param exemptionFilters exemption filters for external links in host mode, or null for none
     * @param normalizers URL normalizers to apply, or null for none
     * @param urlNormalizerScope scope passed to the normalizers
     * @return the normalized, filtered target URL, or {@code null} if the link is rejected
     */
    public static String filterNormalize(String fromUrl, String toUrl, String origin, boolean ignoreInternalLinks, boolean ignoreExternalLinks, String ignoreExternalLinksMode, URLFilters filters, URLExemptionFilters exemptionFilters, URLNormalizers normalizers, String urlNormalizerScope) {
        if (fromUrl.equals(toUrl)) {
            return null;
        }
        if (ignoreExternalLinks || ignoreInternalLinks) {
            URL targetURL;
            try {
                targetURL = new URL(toUrl);
            } catch (MalformedURLException e) {
                // A target that is not a valid URL cannot be classified: skip it.
                return null;
            }
            boolean byDomain = "bydomain".equalsIgnoreCase(ignoreExternalLinksMode);
            String rawTarget = byDomain ? URLUtil.getDomainName(targetURL) : targetURL.getHost();
            // Check for null BEFORE lower-casing so that an undeterminable
            // host/domain skips the link instead of raising a NullPointerException.
            String target = rawTarget == null ? null : rawTarget.toLowerCase();
            boolean sameOrigin = target != null && target.equals(origin);

            if (ignoreExternalLinks && !sameOrigin) {
                // Exemptions are only consulted in host mode.
                boolean exempted = !byDomain && exemptionFilters != null
                        && exemptionFilters.isExempted(fromUrl, toUrl);
                if (!exempted) {
                    return null;
                }
            }
            if (ignoreInternalLinks && (target == null || sameOrigin)) {
                return null;
            }
        }
        try {
            if (normalizers != null) {
                toUrl = normalizers.normalize(toUrl, urlNormalizerScope);
            }
            if (filters != null) {
                toUrl = filters.filter(toUrl);
            }
        } catch (Exception e) {
            // Best effort: a failing normalizer/filter rejects the link instead of failing the task.
            LOG.debug("Skipping link target {}: normalization or filtering failed: {}", toUrl, e.toString());
            return null;
        }
        // May be null when a normalizer or filter rejected the URL.
        return toUrl;
    }

    // Configure the shared partition-number formatter: no grouping separators
    // and at least five integer digits.
    static {
        NUMBER_FORMAT.setGroupingUsed(false);
        NUMBER_FORMAT.setMinimumIntegerDigits(5);
    }

    /**
     * Minimal {@link Map.Entry} pairing an outlink URL with its {@link CrawlDatum}.
     * The key is fixed at construction; the value may be replaced.
     */
    private static class SimpleEntry
    implements Map.Entry<Text, CrawlDatum> {
        private Text key;
        private CrawlDatum value;

        public SimpleEntry(Text key, CrawlDatum value) {
            this.key = key;
            this.value = value;
        }

        @Override
        public Text getKey() {
            return this.key;
        }

        @Override
        public CrawlDatum getValue() {
            return this.value;
        }

        /**
         * Replaces the value of this entry.
         *
         * @param value the new value
         * @return the value held before the call, as required by {@link Map.Entry#setValue}
         */
        @Override
        public CrawlDatum setValue(CrawlDatum value) {
            CrawlDatum previous = this.value;
            this.value = value;
            return previous;
        }
    }
}

