-
xml依赖
<!-- EasyExcel: used by HtmlParseUtil.main to write the scraped rows to an .xlsx file -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>easyexcel</artifactId>
<version>3.0.5</version>
</dependency>
<!-- fastjson: not referenced by the Java code shown in these notes. -->
<!-- NOTE(review): 1.2.41 is an old release; check the fastjson security advisories and upgrade or drop it if unused. -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.41</version>
</dependency>
<!-- Jsoup: fetches and parses the web page -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<!-- Lombok: supplies @Data / @NoArgsConstructor / @AllArgsConstructor on Content. -->
<!-- No version is declared here; presumably it is managed by a parent POM or BOM (confirm). -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
-
实体类
/**
 * One product row: the model class handed to EasyExcel when the sheet is written.
 *
 * <p>Each field carries an {@code @ExcelProperty} annotation whose value is the
 * column header text. Accessors and constructors are generated by Lombok
 * ({@code @Data}, {@code @NoArgsConstructor}, {@code @AllArgsConstructor}); the
 * all-args constructor takes the fields in declaration order: name, price, img.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Content {
/** Product name; exported under the header given in the annotation below. */
@ExcelProperty("商品名称")
private String name;
/** Product price, kept as text exactly as extracted (not parsed into a number). */
@ExcelProperty("商品价格")
private String price;
/** Product image path/URL string. */
@ExcelProperty("商品图片路径")
private String img;
}
-
写表工具类
/**
 * Fetches a JD.com search result page with Jsoup, extracts one {@link Content}
 * per {@code <li>} found under the goods-list element, and (from {@link #main})
 * writes the rows to an Excel workbook with EasyExcel.
 */
@Component
public class HtmlParseUtil {

    /** JD search URL up to and including the {@code keyword=} parameter name. */
    private static final String SEARCH_URL_PREFIX = "https://search.jd.com/Search?keyword=";

    /**
     * Timeout in milliseconds handed to Jsoup for the page fetch.
     * NOTE(review): 300000 ms is five minutes; confirm this is intended (30000 may have been meant).
     */
    private static final int FETCH_TIMEOUT_MILLIS = 300000;

    /** Id of the element whose {@code <li>} descendants are read as products. */
    private static final String GOODS_LIST_ID = "J_goodsList";

    /**
     * Scrapes the results for the keyword "java" and writes them to the sheet
     * "Jsoup" of a fixed workbook path.
     *
     * @param args unused
     * @throws Exception if fetching the page or writing the workbook fails
     */
    public static void main(String[] args) throws Exception {
        String fileName = "D:\\IDEA\\Jsoup\\parseJD.xlsx";
        // Fetch first so that a network failure happens before any writer is set up.
        List<Content> rows = new HtmlParseUtil().parseJD("java");
        EasyExcel.write(fileName, Content.class)
                .sheet("Jsoup")
                .doWrite(rows);
    }

    /**
     * Searches JD for {@code keyword} and converts the result page into rows.
     *
     * @param keyword search term; must not be {@code null}. It is URL-encoded as
     *                UTF-8 before being placed in the query string, so spaces,
     *                reserved characters and non-ASCII text are sent correctly.
     * @return a mutable list with one entry per {@code <li>} under the goods-list
     *         element; empty when the page does not contain that element
     * @throws NullPointerException if {@code keyword} is {@code null}
     * @throws Exception            if the page cannot be fetched or parsed
     */
    public List<Content> parseJD(String keyword) throws Exception {
        java.util.Objects.requireNonNull(keyword, "keyword");

        // Fully-qualified JDK names are used because this snippet has no visible import block to extend.
        String encodedKeyword = java.net.URLEncoder.encode(
                keyword, java.nio.charset.StandardCharsets.UTF_8.name());
        String url = SEARCH_URL_PREFIX + encodedKeyword;

        List<Content> contents = new ArrayList<>();
        Document document = Jsoup.parse(new URL(url), FETCH_TIMEOUT_MILLIS);

        // getElementById returns null when the id is absent from the fetched page;
        // report "no results" instead of failing with a NullPointerException.
        Element goodsList = document.getElementById(GOODS_LIST_ID);
        if (goodsList == null) {
            return contents;
        }

        for (Element item : goodsList.getElementsByTag("li")) {
            String name = item.getElementsByClass("p-name").eq(0).text();
            String price = item.getElementsByClass("p-price").eq(0).text();
            // The image URL is read from the "data-lazy-img" attribute of the first <img>.
            String img = item.getElementsByTag("img").eq(0).attr("data-lazy-img");
            contents.add(new Content(name, price, img));
        }
        return contents;
    }
}