0
点赞
收藏
分享

微信扫一扫

LeetCode(多线程)- 1242. 多线程网页爬虫


题目链接:​​点击打开链接​​

题目大意:略。

解题思路:略。

相关企业

  • Databricks

AC 代码

/**
* // This is the HtmlParser's API interface.
* // You should not implement it, or speculate about its implementation
* interface HtmlParser {
* public List<String> getUrls(String url) {}
* }
*/
class Solution {
// 已知URL集合,存储当前可见的所有URL。
private ConcurrentHashMap<String, Boolean> totalUrls = new ConcurrentHashMap<>();

// 结果URL链表及对应锁。
private ReentrantLock resultLock = new ReentrantLock();
private LinkedList<String> resultUrls = new LinkedList<>();

// 待抓取URL链表及对应锁。
private ReentrantLock crawlLock = new ReentrantLock();
private LinkedList<String> urlsToCrawl = new LinkedList<>();

// 当前正在执行的工作线程个数。
private AtomicInteger choreCount = new AtomicInteger(0);

public List<String> crawl(String startUrl, HtmlParser htmlParser) {
String hostName = extractHostName(startUrl);

this.totalUrls.put(startUrl, true);

addUrlToResult(startUrl);
addUrlToCrawl(startUrl);

while (true) {
String urlToCrawl = fetchUrlToCrawl();
if (urlToCrawl != null) {
incrChore();
Chore chore = new Chore(this, hostName, htmlParser, urlToCrawl);
(new Thread(chore)).start();
} else {
if (this.choreCount.get() == 0) {
break;
}
LockSupport.parkNanos(1L);
}
}

return fetchResultUrls();
}

private String extractHostName(String url) {
// HTTP protocol only.
String processedUrl = url.substring(7);

int index = processedUrl.indexOf("/");
if (index == -1) {
return processedUrl;
} else {
return processedUrl.substring(0, index);
}
}

private class Chore implements Runnable {
private Solution solution;
private String hostName;
private HtmlParser htmlParser;
private String urlToCrawl;

public Chore(Solution solution, String hostName, HtmlParser htmlParser, String urlToCrawl) {
this.solution = solution;
this.hostName = hostName;
this.htmlParser = htmlParser;
this.urlToCrawl = urlToCrawl;
}

@Override
public void run() {
try {
filterUrls(this.htmlParser.getUrls(urlToCrawl));
} finally {
this.solution.decrChore();
}
}

private void filterUrls(List<String> crawledUrls) {
if (crawledUrls == null || crawledUrls.isEmpty()) {
return;
}

for (String url : crawledUrls) {
// 如果该URL在已知的URL集合中已存在,那么不需要再重复抓取。
if (this.solution.totalUrls.containsKey(url)) {
continue;
}

this.solution.totalUrls.put(url, true);

String crawlHostName = this.solution.extractHostName(url);
if (!crawlHostName.equals(this.hostName)) {
// 如果抓取的URL对应的HostName同Start URL对应的HostName不同,那么直接丢弃该URL。
continue;
}

// 将该URL添加至结果链表。
this.solution.addUrlToResult(url);
// 将该URL添加至待抓取链表,以便进行下一跳抓取。
this.solution.addUrlToCrawl(url);
}
}
}

private void addUrlToResult(String url) {
this.resultLock.lock();
try {
this.resultUrls.add(url);
} finally {
this.resultLock.unlock();
}
}

private List<String> fetchResultUrls() {
this.resultLock.lock();
try {
return this.resultUrls;
} finally {
this.resultLock.unlock();
}
}

private void addUrlToCrawl(String url) {
this.crawlLock.lock();
try {
this.urlsToCrawl.add(url);
} finally {
this.crawlLock.unlock();
}
}

private String fetchUrlToCrawl() {
this.crawlLock.lock();
try {
return this.urlsToCrawl.poll();
} finally {
this.crawlLock.unlock();
}
}

private void incrChore() {
this.choreCount.incrementAndGet();
}

private void decrChore() {
this.choreCount.decrementAndGet();
}
}


举报

相关推荐

0 条评论