题目链接:点击打开链接
题目大意:略。
解题思路:略。
相关企业
- Databricks
AC 代码
/**
* // This is the HtmlParser's API interface.
* // You should not implement it, or speculate about its implementation
* interface HtmlParser {
* public List<String> getUrls(String url) {}
* }
*/
class Solution {
// 已知URL集合,存储当前可见的所有URL。
private ConcurrentHashMap<String, Boolean> totalUrls = new ConcurrentHashMap<>();
// 结果URL链表及对应锁。
private ReentrantLock resultLock = new ReentrantLock();
private LinkedList<String> resultUrls = new LinkedList<>();
// 待抓取URL链表及对应锁。
private ReentrantLock crawlLock = new ReentrantLock();
private LinkedList<String> urlsToCrawl = new LinkedList<>();
// 当前正在执行的工作线程个数。
private AtomicInteger choreCount = new AtomicInteger(0);
public List<String> crawl(String startUrl, HtmlParser htmlParser) {
String hostName = extractHostName(startUrl);
this.totalUrls.put(startUrl, true);
addUrlToResult(startUrl);
addUrlToCrawl(startUrl);
while (true) {
String urlToCrawl = fetchUrlToCrawl();
if (urlToCrawl != null) {
incrChore();
Chore chore = new Chore(this, hostName, htmlParser, urlToCrawl);
(new Thread(chore)).start();
} else {
if (this.choreCount.get() == 0) {
break;
}
LockSupport.parkNanos(1L);
}
}
return fetchResultUrls();
}
private String extractHostName(String url) {
// HTTP protocol only.
String processedUrl = url.substring(7);
int index = processedUrl.indexOf("/");
if (index == -1) {
return processedUrl;
} else {
return processedUrl.substring(0, index);
}
}
private class Chore implements Runnable {
private Solution solution;
private String hostName;
private HtmlParser htmlParser;
private String urlToCrawl;
public Chore(Solution solution, String hostName, HtmlParser htmlParser, String urlToCrawl) {
this.solution = solution;
this.hostName = hostName;
this.htmlParser = htmlParser;
this.urlToCrawl = urlToCrawl;
}
public void run() {
try {
filterUrls(this.htmlParser.getUrls(urlToCrawl));
} finally {
this.solution.decrChore();
}
}
private void filterUrls(List<String> crawledUrls) {
if (crawledUrls == null || crawledUrls.isEmpty()) {
return;
}
for (String url : crawledUrls) {
// 如果该URL在已知的URL集合中已存在,那么不需要再重复抓取。
if (this.solution.totalUrls.containsKey(url)) {
continue;
}
this.solution.totalUrls.put(url, true);
String crawlHostName = this.solution.extractHostName(url);
if (!crawlHostName.equals(this.hostName)) {
// 如果抓取的URL对应的HostName同Start URL对应的HostName不同,那么直接丢弃该URL。
continue;
}
// 将该URL添加至结果链表。
this.solution.addUrlToResult(url);
// 将该URL添加至待抓取链表,以便进行下一跳抓取。
this.solution.addUrlToCrawl(url);
}
}
}
private void addUrlToResult(String url) {
this.resultLock.lock();
try {
this.resultUrls.add(url);
} finally {
this.resultLock.unlock();
}
}
private List<String> fetchResultUrls() {
this.resultLock.lock();
try {
return this.resultUrls;
} finally {
this.resultLock.unlock();
}
}
private void addUrlToCrawl(String url) {
this.crawlLock.lock();
try {
this.urlsToCrawl.add(url);
} finally {
this.crawlLock.unlock();
}
}
private String fetchUrlToCrawl() {
this.crawlLock.lock();
try {
return this.urlsToCrawl.poll();
} finally {
this.crawlLock.unlock();
}
}
private void incrChore() {
this.choreCount.incrementAndGet();
}
private void decrChore() {
this.choreCount.decrementAndGet();
}
}