LeetCode（多线程）- 1242. 多线程网页爬虫-CFANZ编程社区

题目链接：点击打开链接
题目大意：略。
解题思路：略。
相关企业
Databricks
AC 代码
/**
 * // This is the HtmlParser's API interface.
 * // You should not implement it, or speculate about its implementation
 * interface HtmlParser {
 *     public List<String> getUrls(String url) {}
 * }
 */
class Solution {
    // 已知URL集合，存储当前可见的所有URL。
    private ConcurrentHashMap<String, Boolean> totalUrls = new ConcurrentHashMap<>();
    
    // 结果URL链表及对应锁。
    private ReentrantLock resultLock = new ReentrantLock();
    private LinkedList<String> resultUrls = new LinkedList<>();

    // 待抓取URL链表及对应锁。
    private ReentrantLock crawlLock = new ReentrantLock();
    private LinkedList<String> urlsToCrawl = new LinkedList<>();

    // 当前正在执行的工作线程个数。
    private AtomicInteger choreCount = new AtomicInteger(0);
    
    public List<String> crawl(String startUrl, HtmlParser htmlParser) {
        String hostName = extractHostName(startUrl);

        this.totalUrls.put(startUrl, true);

        addUrlToResult(startUrl); 
        addUrlToCrawl(startUrl);

        while (true) {
            String urlToCrawl = fetchUrlToCrawl();
            if (urlToCrawl != null) {
                incrChore();
                Chore chore = new Chore(this, hostName, htmlParser, urlToCrawl);
                (new Thread(chore)).start(); 
            } else {
                if (this.choreCount.get() == 0) {
                    break;
                }
                LockSupport.parkNanos(1L);
            }
        }

        return fetchResultUrls();
    }

    private String extractHostName(String url) {
        // HTTP protocol only.
        String processedUrl = url.substring(7);
        
        int index = processedUrl.indexOf("/");
        if (index == -1) {
            return processedUrl;
        } else {
            return processedUrl.substring(0, index);
        }
    }

    private class Chore implements Runnable {
        private Solution solution;
        private String hostName;
        private HtmlParser htmlParser; 
        private String urlToCrawl;

        public Chore(Solution solution, String hostName, HtmlParser htmlParser, String urlToCrawl) {
            this.solution = solution;
            this.hostName = hostName;
            this.htmlParser = htmlParser;
            this.urlToCrawl = urlToCrawl;
        }

        @Override
        public void run() {
            try {
                filterUrls(this.htmlParser.getUrls(urlToCrawl));
            } finally {
                this.solution.decrChore(); 
            }
        }

        private void filterUrls(List<String> crawledUrls) {
            if (crawledUrls == null || crawledUrls.isEmpty()) {
                return;
            }

            for (String url : crawledUrls) {
                // 如果该URL在已知的URL集合中已存在，那么不需要再重复抓取。
                if (this.solution.totalUrls.containsKey(url)) {
                    continue;
                }

                this.solution.totalUrls.put(url, true);
            
                String crawlHostName = this.solution.extractHostName(url);
                if (!crawlHostName.equals(this.hostName)) {
                    // 如果抓取的URL对应的HostName同Start URL对应的HostName不同，那么直接丢弃该URL。
                    continue;
                }
                
                // 将该URL添加至结果链表。
                this.solution.addUrlToResult(url);
                // 将该URL添加至待抓取链表，以便进行下一跳抓取。
                this.solution.addUrlToCrawl(url);
            }
        }
    }

    private void addUrlToResult(String url) {
        this.resultLock.lock();
        try {
            this.resultUrls.add(url);
        } finally {
            this.resultLock.unlock();
        }
    }

    private List<String> fetchResultUrls() {
        this.resultLock.lock();
        try {
            return this.resultUrls;
        } finally {
            this.resultLock.unlock();
        }
    }

    private void addUrlToCrawl(String url) {
        this.crawlLock.lock();
        try {
            this.urlsToCrawl.add(url);
        } finally {
            this.crawlLock.unlock();
        }
    }
    
    private String fetchUrlToCrawl() {
        this.crawlLock.lock();
        try {
            return this.urlsToCrawl.poll();
        } finally {
            this.crawlLock.unlock();
        }
    }

    private void incrChore() {
        this.choreCount.incrementAndGet();
    }

    private void decrChore() {
        this.choreCount.decrementAndGet();
    }
}