Jsoup 相關

Posted by Adam on August 24, 2022
### [Parsing HTML in Java with Jsoup](https://www.baeldung.com/java-with-jsoup)

```xml
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>
```

```java
Document doc = Jsoup.connect("http://example.com").get();
doc.select("p").forEach(System.out::println);

String blogUrl = "https://spring.io/blog";
Document doc = Jsoup.connect(blogUrl).get();

Connection connection = Jsoup.connect(blogUrl);
connection.userAgent("Mozilla");
connection.timeout(5000);
connection.cookie("cookiename", "val234");
connection.cookie("cookiename", "val234");
connection.referrer("http://google.com");
connection.header("headersecurity", "xyz123");
Document docCustomConn = connection.get();

Document docCustomConn = Jsoup.connect(blogUrl)
    .userAgent("Mozilla")
    .timeout(5000)
    .cookie("cookiename", "val234")
    .cookie("anothercookie", "ilovejsoup")
    .referrer("http://google.com")
    .header("headersecurity", "xyz123")
    .get();

Elements links = doc.select("a");
Elements sections = doc.select("section");
Elements logo = doc.select(".spring-logo--container");
Elements pagination = doc.select("#pagination_control");
Elements divsDescendant = doc.select("header div");
Elements divsDirect = doc.select("header > div");

Element pag = doc.getElementById("pagination_control");
Elements desktopOnly = doc.getElementsByClass("desktopOnly");

Element firstArticle = doc.select("article").first();
Element timeElement = firstArticle.select("time").first();
String dateTimeOfFirstArticle = timeElement.attr("datetime");
Element sectionDiv = firstArticle.select("section div").first();
String sectionDivText = sectionDiv.text();
String articleHtml = firstArticle.html();
String outerHtml = firstArticle.outerHtml();

doc.select("li.navbar-link").remove();
firstArticle.select("img").remove();

String docHtml = doc.html();
```