{"id":1162508,"date":"2025-01-15T14:44:17","date_gmt":"2025-01-15T06:44:17","guid":{"rendered":"https:\/\/docs.pingcode.com\/ask\/ask-ask\/1162508.html"},"modified":"2025-01-15T14:44:19","modified_gmt":"2025-01-15T06:44:19","slug":"python%e5%a6%82%e4%bd%95%e5%a4%84%e7%90%86html%e6%95%b0%e6%8d%ae","status":"publish","type":"post","link":"https:\/\/docs.pingcode.com\/ask\/1162508.html","title":{"rendered":"python\u5982\u4f55\u5904\u7406html\u6570\u636e"},"content":{"rendered":"<p style=\"text-align:center;\" ><img decoding=\"async\" src=\"https:\/\/cdn-kb.worktile.com\/kb\/wp-content\/uploads\/2024\/04\/25203503\/a21134c0-e84f-4b8b-b8f9-a4e18d2ceb55.webp\" alt=\"python\u5982\u4f55\u5904\u7406html\u6570\u636e\" \/><\/p>\n<p><p> <strong>Python\u5904\u7406HTML\u6570\u636e\u7684\u65b9\u6cd5\u6709\u5f88\u591a\u79cd\uff0c\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u8fdb\u884c\u89e3\u6790\u3001\u5229\u7528requests\u5e93\u83b7\u53d6\u7f51\u9875\u5185\u5bb9\u3001\u4f7f\u7528lxml\u8fdb\u884c\u89e3\u6790\u3001\u4f7f\u7528\u6b63\u5219\u8868\u8fbe\u5f0f\u3001\u4f7f\u7528Selenium\u6a21\u62df\u6d4f\u89c8\u5668\u64cd\u4f5c\u7b49\u3002<\/strong> \u5176\u4e2d\uff0c\u4f7f\u7528BeautifulSoup\u89e3\u6790HTML\u6570\u636e\u662f\u4e00\u79cd\u5e38\u89c1\u800c\u7b80\u4fbf\u7684\u65b9\u6cd5\u3002BeautifulSoup\u662f\u4e00\u4e2a\u53ef\u4ee5\u65b9\u4fbf\u5730\u4ece\u7f51\u9875\u4e2d\u63d0\u53d6\u6570\u636e\u7684\u5e93\uff0c\u5b83\u63d0\u4f9b\u4e86\u7b80\u5355\u7684API\u6765\u5bfc\u822a\u3001\u641c\u7d22\u548c\u4fee\u6539\u89e3\u6790\u6811\u3002\u63a5\u4e0b\u6765\uff0c\u6211\u4eec\u5c06\u8be6\u7ec6\u4ecb\u7ecd\u5982\u4f55\u4f7f\u7528BeautifulSoup\u6765\u5904\u7406HTML\u6570\u636e\u3002<\/p>\n<\/p>\n<p><h3>\u4e00\u3001\u5b89\u88c5\u548c\u5bfc\u5165\u6240\u9700\u5e93<\/h3>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u6211\u4eec\u9700\u8981\u5b89\u88c5BeautifulSoup\u548crequests\u5e93\u3002\u53ef\u4ee5\u4f7f\u7528pip\u547d\u4ee4\u6765\u5b89\u88c5\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-bash\">pip install 
beautifulsoup4<\/p>\n<p>pip install requests<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u5b89\u88c5\u5b8c\u6210\u540e\uff0c\u6211\u4eec\u53ef\u4ee5\u5728\u4ee3\u7801\u4e2d\u5bfc\u5165\u8fd9\u4e9b\u5e93\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from bs4 import BeautifulSoup<\/p>\n<p>import requests<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e8c\u3001\u83b7\u53d6\u7f51\u9875\u5185\u5bb9<\/h3>\n<\/p>\n<p><p>\u5728\u5904\u7406HTML\u6570\u636e\u4e4b\u524d\uff0c\u6211\u4eec\u9700\u8981\u5148\u83b7\u53d6\u7f51\u9875\u5185\u5bb9\u3002\u53ef\u4ee5\u4f7f\u7528requests\u5e93\u6765\u53d1\u9001HTTP\u8bf7\u6c42\u5e76\u83b7\u53d6\u7f51\u9875\u5185\u5bb9\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">url = &#39;http:\/\/example.com&#39;<\/p>\n<p>response = requests.get(url)<\/p>\n<p>html_content = response.content<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u8fd9\u91cc\u7684<code>url<\/code>\u662f\u6211\u4eec\u8981\u5904\u7406\u7684\u7f51\u9875\u5730\u5740\uff0c<code>response.content<\/code>\u83b7\u53d6\u5230\u7684\u662f\u7f51\u9875\u7684HTML\u5185\u5bb9\u3002<\/p>\n<\/p>\n<p><h3>\u4e09\u3001\u89e3\u6790HTML\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u6709\u4e86HTML\u5185\u5bb9\u4e4b\u540e\uff0c\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u6765\u89e3\u6790\u8fd9\u4e9b\u6570\u636e\u3002BeautifulSoup\u652f\u6301\u591a\u79cd\u89e3\u6790\u5668\uff0c\u5176\u4e2d\u6700\u5e38\u7528\u7684\u662flxml\u89e3\u6790\u5668\u548chtml.parser\u89e3\u6790\u5668\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">soup = BeautifulSoup(html_content, 
&#39;lxml&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u56db\u3001\u67e5\u627e\u548c\u63d0\u53d6\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u89e3\u6790\u5b8cHTML\u6570\u636e\u540e\uff0c\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u63d0\u4f9b\u7684\u65b9\u6cd5\u6765\u67e5\u627e\u548c\u63d0\u53d6\u6211\u4eec\u9700\u8981\u7684\u6570\u636e\u3002<\/p>\n<\/p>\n<p><h4>1\u3001\u67e5\u627e\u5355\u4e2a\u5143\u7d20<\/h4>\n<\/p>\n<p><p>\u53ef\u4ee5\u4f7f\u7528<code>find<\/code>\u65b9\u6cd5\u67e5\u627e\u7b2c\u4e00\u4e2a\u7b26\u5408\u6761\u4ef6\u7684\u5143\u7d20\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">title = soup.find(&#39;title&#39;)<\/p>\n<p>print(title.text)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>2\u3001\u67e5\u627e\u591a\u4e2a\u5143\u7d20<\/h4>\n<\/p>\n<p><p>\u53ef\u4ee5\u4f7f\u7528<code>find_all<\/code>\u65b9\u6cd5\u67e5\u627e\u6240\u6709\u7b26\u5408\u6761\u4ef6\u7684\u5143\u7d20\uff0c\u8fd4\u56de\u4e00\u4e2a\u5217\u8868\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">links = soup.find_all(&#39;a&#39;)<\/p>\n<p>for link in links:<\/p>\n<p>    print(link[&#39;href&#39;])<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>3\u3001\u4f7f\u7528CSS\u9009\u62e9\u5668<\/h4>\n<\/p>\n<p><p>\u4e5f\u53ef\u4ee5\u4f7f\u7528CSS\u9009\u62e9\u5668\u6765\u67e5\u627e\u5143\u7d20\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">header = soup.select_one(&#39;h1.header&#39;)<\/p>\n<p>print(header.text)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><pre><code class=\"language-python\">items = soup.select(&#39;.item&#39;)<\/p>\n<p>for item in items:<\/p>\n<p>    
print(item.text)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e94\u3001\u4fee\u6539HTML\u6570\u636e<\/h3>\n<\/p>\n<p><p>BeautifulSoup\u4e0d\u4ec5\u53ef\u4ee5\u7528\u6765\u67e5\u627e\u548c\u63d0\u53d6\u6570\u636e\uff0c\u8fd8\u53ef\u4ee5\u7528\u6765\u4fee\u6539HTML\u6570\u636e\u3002\u4f8b\u5982\uff0c\u53ef\u4ee5\u4fee\u6539\u5143\u7d20\u7684\u5c5e\u6027\u3001\u5220\u9664\u5143\u7d20\u3001\u63d2\u5165\u65b0\u5143\u7d20\u7b49\u3002<\/p>\n<\/p>\n<p><h4>1\u3001\u4fee\u6539\u5143\u7d20\u7684\u5c5e\u6027<\/h4>\n<\/p>\n<p><pre><code class=\"language-python\">tag = soup.find(&#39;a&#39;)<\/p>\n<p>tag[&#39;href&#39;] = &#39;http:\/\/newurl.com&#39;<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>2\u3001\u5220\u9664\u5143\u7d20<\/h4>\n<\/p>\n<p><pre><code class=\"language-python\">tag = soup.find(&#39;div&#39;, class_=&#39;ad&#39;)<\/p>\n<p>tag.decompose()<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>3\u3001\u63d2\u5165\u65b0\u5143\u7d20<\/h4>\n<\/p>\n<p><pre><code class=\"language-python\">new_tag = soup.new_tag(&#39;p&#39;)<\/p>\n<p>new_tag.string = &quot;This is a new paragraph.&quot;<\/p>\n<p>soup.body.append(new_tag)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u516d\u3001\u4fdd\u5b58\u4fee\u6539\u540e\u7684HTML\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u4fee\u6539\u5b8cHTML\u6570\u636e\u540e\uff0c\u53ef\u4ee5\u5c06\u5176\u4fdd\u5b58\u5230\u6587\u4ef6\u4e2d\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">with open(&#39;modified.html&#39;, &#39;w&#39;, encoding=&#39;utf-8&#39;) as file:<\/p>\n<p>    
file.write(str(soup))<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e03\u3001\u5904\u7406\u52a8\u6001\u7f51\u9875<\/h3>\n<\/p>\n<p><p>\u5bf9\u4e8e\u4e00\u4e9b\u52a8\u6001\u7f51\u9875\uff0c\u4f7f\u7528requests\u83b7\u53d6\u7684\u5185\u5bb9\u53ef\u80fd\u5e76\u4e0d\u5b8c\u6574\u3002\u8fd9\u65f6\uff0c\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528Selenium\u6765\u6a21\u62df\u6d4f\u89c8\u5668\u64cd\u4f5c\u5e76\u83b7\u53d6\u5b8c\u6574\u7684\u7f51\u9875\u5185\u5bb9\u3002<\/p>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u5b89\u88c5Selenium\u5e93\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-bash\">pip install selenium<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u4e0b\u8f7d\u5bf9\u5e94\u7684\u6d4f\u89c8\u5668\u9a71\u52a8\uff08\u5982ChromeDriver\uff09\u5e76\u5c06\u5176\u8def\u5f84\u6dfb\u52a0\u5230\u7cfb\u7edf\u73af\u5883\u53d8\u91cf\u4e2d\u3002<\/p>\n<\/p>\n<p><p>\u63a5\u4e0b\u6765\uff0c\u4f7f\u7528Selenium\u83b7\u53d6\u52a8\u6001\u7f51\u9875\u5185\u5bb9\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from selenium import webdriver<\/p>\n<p># \u8bbe\u7f6e\u6d4f\u89c8\u5668\u9009\u9879<\/p>\n<p>options = webdriver.ChromeOptions()<\/p>\n<p>options.add_argument(&#39;--headless&#39;)<\/p>\n<p># \u521b\u5efa\u6d4f\u89c8\u5668\u5bf9\u8c61<\/p>\n<p>browser = webdriver.Chrome(options=options)<\/p>\n<p># \u6253\u5f00\u76ee\u6807\u7f51\u9875<\/p>\n<p>url = &#39;http:\/\/example.com&#39;<\/p>\n<p>browser.get(url)<\/p>\n<p># \u83b7\u53d6\u7f51\u9875\u5185\u5bb9<\/p>\n<p>html_content = browser.page_source<\/p>\n<p># \u5173\u95ed\u6d4f\u89c8\u5668<\/p>\n<p>browser.quit()<\/p>\n<p># \u89e3\u6790HTML\u6570\u636e<\/p>\n<p>soup = BeautifulSoup(html_content, 
&#39;lxml&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u516b\u3001\u5904\u7406HTML\u8868\u683c\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u5728\u5904\u7406\u7f51\u9875\u4e2d\u7684\u8868\u683c\u6570\u636e\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u6765\u89e3\u6790\u8868\u683c\uff0c\u5e76\u5c06\u6570\u636e\u63d0\u53d6\u5230Pandas\u6570\u636e\u6846\u4e2d\u8fdb\u884c\u8fdb\u4e00\u6b65\u5206\u6790\u3002<\/p>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u5b89\u88c5Pandas\u5e93\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-bash\">pip install pandas<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u4f7f\u7528\u4ee5\u4e0b\u4ee3\u7801\u89e3\u6790\u8868\u683c\u6570\u636e\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import pandas as pd<\/p>\n<p># \u67e5\u627e\u8868\u683c<\/p>\n<p>table = soup.find(&#39;table&#39;)<\/p>\n<p># \u63d0\u53d6\u8868\u683c\u6570\u636e<\/p>\n<p>rows = table.find_all(&#39;tr&#39;)<\/p>\n<p>data = []<\/p>\n<p>for row in rows:<\/p>\n<p>    cols = row.find_all(&#39;td&#39;)<\/p>\n<p>    cols = [col.text.strip() for col in cols]<\/p>\n<p>    data.append(cols)<\/p>\n<p># \u5c06\u6570\u636e\u8f6c\u6362\u4e3aPandas\u6570\u636e\u6846<\/p>\n<p>df = pd.DataFrame(data)<\/p>\n<p>print(df)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e5d\u3001\u5904\u7406HTML\u8868\u5355\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u5904\u7406HTML\u8868\u5355\u6570\u636e\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u6765\u89e3\u6790\u8868\u5355\uff0c\u5e76\u4f7f\u7528requests\u5e93\u6765\u6a21\u62df\u8868\u5355\u63d0\u4ea4\u3002<\/p>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u627e\u5230\u8868\u5355\u53ca\u5176\u8f93\u5165\u5143\u7d20\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">form = soup.find(&#39;form&#39;)<\/p>\n<p>inputs = form.find_all(&#39;input&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u6784\u9020\u8868\u5355\u6570\u636e\u5e76\u63d0\u4ea4\uff1a<\/p>\n<\/p>\n<p><pre><code 
class=\"language-python\">form_data = {}<\/p>\n<p>for input_element in inputs:<\/p>\n<p>    name = input_element.get(&#39;name&#39;)<\/p>\n<p>    value = input_element.get(&#39;value&#39;, &#39;&#39;)<\/p>\n<p>    form_data[name] = value<\/p>\n<p># \u63d0\u4ea4\u8868\u5355<\/p>\n<p>post_url = form.get(&#39;action&#39;)<\/p>\n<p>response = requests.post(post_url, data=form_data)<\/p>\n<p>print(response.text)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u5341\u3001\u5904\u7406HTML\u4e2d\u7684\u56fe\u7247\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u5904\u7406HTML\u4e2d\u7684\u56fe\u7247\u6570\u636e\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u6765\u67e5\u627e\u56fe\u7247\u5143\u7d20\uff0c\u5e76\u4f7f\u7528requests\u5e93\u6765\u4e0b\u8f7d\u56fe\u7247\u3002<\/p>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u627e\u5230\u6240\u6709\u56fe\u7247\u5143\u7d20\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">images = soup.find_all(&#39;img&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u4e0b\u8f7d\u56fe\u7247\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">for img in images:<\/p>\n<p>    img_url = img.get(&#39;src&#39;)<\/p>\n<p>    img_response = requests.get(img_url)<\/p>\n<p>    img_name = img_url.split(&#39;\/&#39;)[-1]<\/p>\n<p>    with open(img_name, &#39;wb&#39;) as img_file:<\/p>\n<p>        img_file.write(img_response.content)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u5341\u4e00\u3001\u5904\u7406HTML\u4e2d\u7684\u94fe\u63a5\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u5904\u7406HTML\u4e2d\u7684\u94fe\u63a5\u6570\u636e\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u6765\u67e5\u627e\u94fe\u63a5\u5143\u7d20\uff0c\u5e76\u63d0\u53d6\u94fe\u63a5\u5730\u5740\u3002<\/p>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u627e\u5230\u6240\u6709\u94fe\u63a5\u5143\u7d20\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">links = 
soup.find_all(&#39;a&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u63d0\u53d6\u94fe\u63a5\u5730\u5740\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">for link in links:<\/p>\n<p>    href = link.get(&#39;href&#39;)<\/p>\n<p>    print(href)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u5341\u4e8c\u3001\u5904\u7406HTML\u4e2d\u7684\u5d4c\u5957\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u5904\u7406HTML\u4e2d\u7684\u5d4c\u5957\u6570\u636e\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u6765\u9010\u5c42\u89e3\u6790\u5d4c\u5957\u5143\u7d20\u3002<\/p>\n<\/p>\n<p><p>\u4f8b\u5982\uff0c\u5904\u7406\u5d4c\u5957\u5217\u8868\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">nested_list = soup.find(&#39;ul&#39;, class_=&#39;nested-list&#39;)<\/p>\n<p>items = nested_list.find_all(&#39;li&#39;)<\/p>\n<p>for item in items:<\/p>\n<p>    sub_items = item.find_all(&#39;li&#39;)<\/p>\n<p>    for sub_item in sub_items:<\/p>\n<p>        print(sub_item.text)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u5341\u4e09\u3001\u5904\u7406HTML\u4e2d\u7684\u8868\u5355\u63d0\u4ea4\u540e\u8fd4\u56de\u7684\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u5728\u5904\u7406\u8868\u5355\u63d0\u4ea4\u540e\u8fd4\u56de\u7684\u6570\u636e\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528requests\u5e93\u6765\u6a21\u62df\u8868\u5355\u63d0\u4ea4\uff0c\u5e76\u4f7f\u7528BeautifulSoup\u6765\u89e3\u6790\u8fd4\u56de\u7684HTML\u6570\u636e\u3002<\/p>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u627e\u5230\u8868\u5355\u53ca\u5176\u8f93\u5165\u5143\u7d20\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">form = soup.find(&#39;form&#39;)<\/p>\n<p>inputs = form.find_all(&#39;input&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u6784\u9020\u8868\u5355\u6570\u636e\u5e76\u63d0\u4ea4\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">form_data = {}<\/p>\n<p>for input_element in inputs:<\/p>\n<p>    name = input_element.get(&#39;name&#39;)<\/p>\n<p>    value = input_element.get(&#39;value&#39;, 
&#39;&#39;)<\/p>\n<p>    form_data[name] = value<\/p>\n<p># \u63d0\u4ea4\u8868\u5355<\/p>\n<p>post_url = form.get(&#39;action&#39;)<\/p>\n<p>response = requests.post(post_url, data=form_data)<\/p>\n<p>returned_html = response.content<\/p>\n<p># \u89e3\u6790\u8fd4\u56de\u7684HTML\u6570\u636e<\/p>\n<p>returned_soup = BeautifulSoup(returned_html, &#39;lxml&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u5341\u56db\u3001\u5904\u7406HTML\u4e2d\u7684JavaScript\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u5904\u7406HTML\u4e2d\u7684JavaScript\u6570\u636e\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u6765\u89e3\u6790\u5d4c\u5165\u5728HTML\u4e2d\u7684JavaScript\u4ee3\u7801\u3002<\/p>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u627e\u5230\u6240\u6709script\u5143\u7d20\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">scripts = soup.find_all(&#39;script&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u63d0\u53d6JavaScript\u4ee3\u7801\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">for script in scripts:<\/p>\n<p>    js_code = script.string<\/p>\n<p>    print(js_code)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u5341\u4e94\u3001\u5904\u7406HTML\u4e2d\u7684CSS\u6837\u5f0f\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u5904\u7406HTML\u4e2d\u7684CSS\u6837\u5f0f\u6570\u636e\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u6765\u89e3\u6790\u5d4c\u5165\u5728HTML\u4e2d\u7684CSS\u4ee3\u7801\u3002<\/p>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u627e\u5230\u6240\u6709style\u5143\u7d20\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">styles = soup.find_all(&#39;style&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u63d0\u53d6CSS\u4ee3\u7801\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">for style in styles:<\/p>\n<p>    css_code = style.string<\/p>\n<p>    
print(css_code)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u5341\u516d\u3001\u5904\u7406HTML\u4e2d\u7684\u8868\u5355\u5b57\u6bb5\u9a8c\u8bc1\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u5904\u7406HTML\u4e2d\u7684\u8868\u5355\u5b57\u6bb5\u9a8c\u8bc1\u6570\u636e\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u6765\u89e3\u6790\u8868\u5355\u4e2d\u7684\u9a8c\u8bc1\u89c4\u5219\u3002<\/p>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u627e\u5230\u8868\u5355\u53ca\u5176\u8f93\u5165\u5143\u7d20\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">form = soup.find(&#39;form&#39;)<\/p>\n<p>inputs = form.find_all(&#39;input&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u63d0\u53d6\u9a8c\u8bc1\u89c4\u5219\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">for input_element in inputs:<\/p>\n<p>    name = input_element.get(&#39;name&#39;)<\/p>\n<p>    required = input_element.get(&#39;required&#39;)<\/p>\n<p>    pattern = input_element.get(&#39;pattern&#39;)<\/p>\n<p>    print(f&#39;Name: {name}, Required: {required}, Pattern: {pattern}&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u5341\u4e03\u3001\u5904\u7406HTML\u4e2d\u7684\u591a\u5a92\u4f53\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u5904\u7406HTML\u4e2d\u7684\u591a\u5a92\u4f53\u6570\u636e\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u6765\u67e5\u627e\u591a\u5a92\u4f53\u5143\u7d20\uff0c\u5e76\u4f7f\u7528requests\u5e93\u6765\u4e0b\u8f7d\u591a\u5a92\u4f53\u6587\u4ef6\u3002<\/p>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u627e\u5230\u6240\u6709\u97f3\u9891\u548c\u89c6\u9891\u5143\u7d20\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">audios = soup.find_all(&#39;audio&#39;)<\/p>\n<p>videos = soup.find_all(&#39;video&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u4e0b\u8f7d\u591a\u5a92\u4f53\u6587\u4ef6\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">for audio in audios:<\/p>\n<p>    audio_url = audio.get(&#39;src&#39;)<\/p>\n<p>    audio_response = requests.get(audio_url)<\/p>\n<p>    audio_name = 
audio_url.split(&#39;\/&#39;)[-1]<\/p>\n<p>    with open(audio_name, &#39;wb&#39;) as audio_file:<\/p>\n<p>        audio_file.write(audio_response.content)<\/p>\n<p>for video in videos:<\/p>\n<p>    video_url = video.get(&#39;src&#39;)<\/p>\n<p>    video_response = requests.get(video_url)<\/p>\n<p>    video_name = video_url.split(&#39;\/&#39;)[-1]<\/p>\n<p>    with open(video_name, &#39;wb&#39;) as video_file:<\/p>\n<p>        video_file.write(video_response.content)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u5341\u516b\u3001\u5904\u7406HTML\u4e2d\u7684iframe\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u5904\u7406HTML\u4e2d\u7684iframe\u6570\u636e\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u6765\u67e5\u627eiframe\u5143\u7d20\uff0c\u5e76\u83b7\u53d6\u5176src\u5c5e\u6027\uff0c\u7136\u540e\u89e3\u6790\u5d4c\u5165\u7684HTML\u5185\u5bb9\u3002<\/p>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u627e\u5230\u6240\u6709iframe\u5143\u7d20\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">iframes = soup.find_all(&#39;iframe&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u83b7\u53d6iframe\u7684src\u5c5e\u6027\u5e76\u89e3\u6790\u5d4c\u5165\u7684HTML\u5185\u5bb9\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">for iframe in iframes:<\/p>\n<p>    iframe_url = iframe.get(&#39;src&#39;)<\/p>\n<p>    iframe_response = requests.get(iframe_url)<\/p>\n<p>    iframe_html = iframe_response.content<\/p>\n<p>    iframe_soup = BeautifulSoup(iframe_html, &#39;lxml&#39;)<\/p>\n<p>    print(iframe_soup.prettify())<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u5341\u4e5d\u3001\u5904\u7406HTML\u4e2d\u7684meta\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u5904\u7406HTML\u4e2d\u7684meta\u6570\u636e\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u6765\u67e5\u627emeta\u5143\u7d20\uff0c\u5e76\u63d0\u53d6\u5176\u5c5e\u6027\u503c\u3002<\/p>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u627e\u5230\u6240\u6709meta\u5143\u7d20\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">metas = 
soup.find_all(&#39;meta&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u63d0\u53d6meta\u5c5e\u6027\u503c\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">for meta in metas:<\/p>\n<p>    name = meta.get(&#39;name&#39;)<\/p>\n<p>    content = meta.get(&#39;content&#39;)<\/p>\n<p>    print(f&#39;Name: {name}, Content: {content}&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e8c\u5341\u3001\u5904\u7406HTML\u4e2d\u7684\u7ed3\u6784\u5316\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u5904\u7406HTML\u4e2d\u7684\u7ed3\u6784\u5316\u6570\u636e\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u6765\u89e3\u6790\u7ed3\u6784\u5316\u6570\u636e\uff0c\u4f8b\u5982JSON-LD\u3001Microdata\u548cRDFa\u3002<\/p>\n<\/p>\n<p><h4>1\u3001\u89e3\u6790JSON-LD\u6570\u636e<\/h4>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u627e\u5230\u6240\u6709script\u5143\u7d20\u5e76\u8fc7\u6ee4\u51fa\u7c7b\u578b\u4e3aapplication\/ld+json\u7684\u5143\u7d20\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">json_ld_scripts = soup.find_all(&#39;script&#39;, type=&#39;application\/ld+json&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u89e3\u6790JSON-LD\u6570\u636e\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import json<\/p>\n<p>for script in json_ld_scripts:<\/p>\n<p>    json_ld_data = json.loads(script.string)<\/p>\n<p>    print(json_ld_data)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>2\u3001\u89e3\u6790Microdata<\/h4>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u627e\u5230\u6240\u6709\u5e26\u6709itemscope\u5c5e\u6027\u7684\u5143\u7d20\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">microdata_items = soup.find_all(attrs={&quot;itemscope&quot;: True})<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u63d0\u53d6Microdata\u6570\u636e\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">for item in microdata_items:<\/p>\n<p>    item_type = item.get(&#39;itemtype&#39;)<\/p>\n<p>    properties = item.find_all(attrs={&quot;itemprop&quot;: True})<\/p>\n<p>    
item_data = {&quot;@type&quot;: item_type}<\/p>\n<p>    for prop in properties:<\/p>\n<p>        prop_name = prop.get(&#39;itemprop&#39;)<\/p>\n<p>        prop_value = prop.text.strip()<\/p>\n<p>        item_data[prop_name] = prop_value<\/p>\n<p>    print(item_data)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>3\u3001\u89e3\u6790RDFa\u6570\u636e<\/h4>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u627e\u5230\u6240\u6709\u5e26\u6709typeof\u5c5e\u6027\u7684\u5143\u7d20\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">rdfa_items = soup.find_all(attrs={&quot;typeof&quot;: True})<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u63d0\u53d6RDFa\u6570\u636e\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">for item in rdfa_items:<\/p>\n<p>    item_type = item.get(&#39;typeof&#39;)<\/p>\n<p>    properties = item.find_all(attrs={&quot;property&quot;: True})<\/p>\n<p>    item_data = {&quot;@type&quot;: item_type}<\/p>\n<p>    for prop in properties:<\/p>\n<p>        prop_name = prop.get(&#39;property&#39;)<\/p>\n<p>        prop_value = prop.text.strip()<\/p>\n<p>        item_data[prop_name] = prop_value<\/p>\n<p>    print(item_data)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e8c\u5341\u4e00\u3001\u5904\u7406HTML\u4e2d\u7684\u5d4c\u5957\u8868\u683c\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u5904\u7406HTML\u4e2d\u7684\u5d4c\u5957\u8868\u683c\u6570\u636e\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u6765\u9010\u5c42\u89e3\u6790\u5d4c\u5957\u7684\u8868\u683c\u5143\u7d20\u3002<\/p>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u627e\u5230\u6240\u6709\u8868\u683c\u5143\u7d20\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">tables = soup.find_all(&#39;table&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u9012\u5f52\u89e3\u6790\u5d4c\u5957\u7684\u8868\u683c\u6570\u636e\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">def parse_table(table):<\/p>\n<p>    rows = table.find_all(&#39;tr&#39;)<\/p>\n<p>    data = []<\/p>\n<p>    for row in rows:<\/p>\n<p>     
   cols = row.find_all(&#39;td&#39;)<\/p>\n<p>        cols_data = []<\/p>\n<p>        for col in cols:<\/p>\n<p>            if col.find(&#39;table&#39;):<\/p>\n<p>                cols_data.append(parse_table(col.find(&#39;table&#39;)))<\/p>\n<p>            else:<\/p>\n<p>                cols_data.append(col.text.strip())<\/p>\n<p>        data.append(cols_data)<\/p>\n<p>    return data<\/p>\n<p>for table in tables:<\/p>\n<p>    table_data = parse_table(table)<\/p>\n<p>    print(table_data)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e8c\u5341\u4e8c\u3001\u5904\u7406HTML\u4e2d\u7684\u6570\u636e\u8868\u683c\u5206\u9875<\/h3>\n<\/p>\n<p><p>\u5904\u7406HTML\u4e2d\u7684\u6570\u636e\u8868\u683c\u5206\u9875\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u6765\u89e3\u6790\u5206\u9875\u6570\u636e\uff0c\u5e76\u4f7f\u7528requests\u5e93\u6765\u83b7\u53d6\u4e0b\u4e00\u9875\u7684\u6570\u636e\u3002<\/p>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u627e\u5230\u5206\u9875\u94fe\u63a5\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">pagination_links = soup.find_all(&#39;a&#39;, class_=&#39;pagination-link&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u904d\u5386\u5206\u9875\u94fe\u63a5\u5e76\u83b7\u53d6\u6bcf\u9875\u7684\u6570\u636e\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">for link in pagination_links:<\/p>\n<p>    page_url = link.get(&#39;href&#39;)<\/p>\n<p>    page_response = requests.get(page_url)<\/p>\n<p>    page_html = page_response.content<\/p>\n<p>    page_soup = BeautifulSoup(page_html, &#39;lxml&#39;)<\/p>\n<p>    table = page_soup.find(&#39;table&#39;)<\/p>\n<p>    table_data = parse_table(table)<\/p>\n<p>    
print(table_data)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e8c\u5341\u4e09\u3001\u5904\u7406HTML\u4e2d\u7684\u52a8\u6001\u52a0\u8f7d\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u5904\u7406HTML\u4e2d\u7684\u52a8\u6001\u52a0\u8f7d\u6570\u636e\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528Selenium\u6765\u6a21\u62df\u6d4f\u89c8\u5668\u64cd\u4f5c\u5e76\u83b7\u53d6\u52a8\u6001\u52a0\u8f7d\u7684\u7f51\u9875\u5185\u5bb9\u3002<\/p>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u8bbe\u7f6eSelenium\u6d4f\u89c8\u5668\u9009\u9879\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from selenium import webdriver<\/p>\n<p>options = webdriver.ChromeOptions()<\/p>\n<p>options.add_argument(&#39;--headless&#39;)<\/p>\n<p>browser = webdriver.Chrome(options=options)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u6253\u5f00\u76ee\u6807\u7f51\u9875\u5e76\u83b7\u53d6\u52a8\u6001\u52a0\u8f7d\u7684\u6570\u636e\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">url = &#39;http:\/\/example.com&#39;<\/p>\n<p>browser.get(url)<\/p>\n<p># \u7b49\u5f85\u52a8\u6001\u52a0\u8f7d\u5b8c\u6210<\/p>\n<p>import time<\/p>\n<p>time.sleep(5)<\/p>\n<p># \u83b7\u53d6\u52a8\u6001\u52a0\u8f7d\u540e\u7684\u7f51\u9875\u5185\u5bb9<\/p>\n<p>html_content = browser.page_source<\/p>\n<p>browser.quit()<\/p>\n<p># \u89e3\u6790HTML\u6570\u636e<\/p>\n<p>soup = BeautifulSoup(html_content, &#39;lxml&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e8c\u5341\u56db\u3001\u5904\u7406HTML\u4e2d\u7684\u8868\u683c\u5408\u5e76\u5355\u5143\u683c\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u5904\u7406HTML\u4e2d\u7684\u8868\u683c\u5408\u5e76\u5355\u5143\u683c\u6570\u636e\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u6765\u89e3\u6790\u5408\u5e76\u5355\u5143\u683c\u7684\u6570\u636e\u3002<\/p>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u627e\u5230\u6240\u6709\u8868\u683c\u5143\u7d20\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">tables = 
soup.find_all(&#39;table&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u89e3\u6790\u5408\u5e76\u5355\u5143\u683c\u6570\u636e\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">def parse_table_with_merged_cells(table):<\/p>\n<p>    rows = table.find_all(&#39;tr&#39;)<\/p>\n<p>    data = []<\/p>\n<p>    for row in rows:<\/p>\n<p>        cols = row.find_all([&#39;td&#39;, &#39;th&#39;])<\/p>\n<p>        cols_data = []<\/p>\n<p>        for col in cols:<\/p>\n<p>            rowspan = int(col.get(&#39;rowspan&#39;, 1))<\/p>\n<p>            colspan = int(col.get(&#39;colspan&#39;, 1))<\/p>\n<p>            cell_data = col.text.strip()<\/p>\n<p>            for _ in range(rowspan):<\/p>\n<p>                cols_data.append([cell_data] * colspan)<\/p>\n<p>        data.append(cols_data)<\/p>\n<p>    return data<\/p>\n<p>for table in tables:<\/p>\n<p>    table_data = parse_table_with_merged_cells(table)<\/p>\n<p>    print(table_data)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e8c\u5341\u4e94\u3001\u5904\u7406HTML\u4e2d\u7684\u590d\u6742\u8868\u683c\u6570\u636e<\/h3>\n<\/p>\n<p><p>\u5904\u7406HTML\u4e2d\u7684\u590d\u6742\u8868\u683c\u6570\u636e\u65f6\uff0c\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u6765\u89e3\u6790\u590d\u6742\u7684\u8868\u683c\u7ed3\u6784\u3002<\/p>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u627e\u5230\u6240\u6709\u8868\u683c\u5143\u7d20\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">tables = soup.find_all(&#39;table&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7136\u540e\uff0c\u89e3\u6790\u590d\u6742\u8868\u683c\u6570\u636e\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">def parse_complex_table(table):<\/p>\n<p>    rows = table.find_all(&#39;tr&#39;)<\/p>\n<p>    data = []<\/p>\n<p>    for row in rows:<\/p>\n<p>        cols = row.find_all([&#39;td&#39;, &#39;th&#39;])<\/p>\n<p>        cols_data = []<\/p>\n<p>        for col in cols:<\/p>\n<p>            if col.find(&#39;table&#39;):<\/p>\n<p>                
cols_data.append(parse_complex_table(col.find(&#39;table&#39;)))<\/p>\n<p>            else:<\/p>\n<p>                rowspan = int(col.get(&#39;rowspan&#39;, 1))<\/p>\n<p>                colspan = int(col.get(&#39;colspan&#39;, 1))<\/p>\n<p>                cell_data = col.text.strip()<\/p>\n<p>                for _ in range(rowspan):<\/p>\n<p>                    cols_data.append([cell_data] * colspan)<\/p>\n<p>        data.append(cols_data)<\/p>\n<p>    return data<\/p>\n<p>for table in tables:<\/p>\n<p>    table_data = parse_complex_table(table)<\/p>\n<p>    print(table_data)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u901a\u8fc7\u4ee5\u4e0a\u7684\u65b9\u6cd5\uff0c\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528Python\u6765\u5904\u7406\u5404\u79cdHTML\u6570\u636e\uff0c\u5305\u62ec\u89e3\u6790\u3001\u4fee\u6539\u3001\u63d0\u53d6\u3001\u4fdd\u5b58\u548c\u5904\u7406\u52a8\u6001\u52a0\u8f7d\u7684\u6570\u636e\u3002BeautifulSoup\u548crequests\u5e93\u662f\u5904\u7406HTML\u6570\u636e\u7684\u5e38\u7528\u5de5\u5177\uff0c\u800cSelenium\u5219\u53ef\u4ee5\u5e2e\u52a9\u6211\u4eec\u5904\u7406\u52a8\u6001\u7f51\u9875\u3002\u5e0c\u671b\u8fd9\u4e9b\u65b9\u6cd5\u80fd\u591f\u5e2e\u52a9\u60a8\u66f4\u597d\u5730\u5904\u7406\u548c\u5206\u6790HTML\u6570\u636e\u3002<\/p>\n<\/p>\n<h2><strong>\u76f8\u5173\u95ee\u7b54FAQs\uff1a<\/strong><\/h2>\n<p> <strong>\u5982\u4f55\u4f7f\u7528Python\u4eceHTML\u4e2d\u63d0\u53d6\u7279\u5b9a\u6570\u636e\uff1f<\/strong><br \/>\u5728Python\u4e2d\uff0c\u53ef\u4ee5\u4f7f\u7528Beautiful Soup\u5e93\u6765\u89e3\u6790HTML\u6570\u636e\u3002\u901a\u8fc7\u5c06HTML\u6587\u6863\u52a0\u8f7d\u5230Beautiful Soup\u5bf9\u8c61\u4e2d\uff0c\u60a8\u53ef\u4ee5\u4f7f\u7528\u5404\u79cd\u65b9\u6cd5\u627e\u5230\u7279\u5b9a\u7684\u6807\u7b7e\u3001\u7c7b\u6216ID\uff0c\u5e76\u63d0\u53d6\u6240\u9700\u7684\u4fe1\u606f\u3002\u7ed3\u5408requests\u5e93\uff0c\u60a8\u53ef\u4ee5\u76f4\u63a5\u4ece\u7f51\u9875\u6293\u53d6\u6570\u636e\uff0c\u793a\u4f8b\u4ee3\u7801\u5982\u4e0b\uff1a  <\/p>\n<pre><code 
class=\"language-python\">import requests\nfrom bs4 import BeautifulSoup\n\nurl = &#39;https:\/\/example.com&#39;\nresponse = requests.get(url)\nsoup = BeautifulSoup(response.text, &#39;html.parser&#39;)\n\n# \u63d0\u53d6\u7279\u5b9a\u6570\u636e\ndata = soup.find_all(&#39;h1&#39;)  # \u63d0\u53d6\u6240\u6709h1\u6807\u7b7e\u5185\u5bb9\nfor item in data:\n    print(item.text)\n<\/code><\/pre>\n<p><strong>Python\u4e2d\u89e3\u6790HTML\u7684\u6700\u4f73\u5e93\u6709\u54ea\u4e9b\uff1f<\/strong><br \/>\u5728Python\u4e2d\uff0c\u6709\u51e0\u4e2a\u6d41\u884c\u7684\u5e93\u53ef\u4ee5\u89e3\u6790HTML\u6570\u636e\u3002Beautiful Soup\u662f\u6700\u5e38\u7528\u7684\u9009\u62e9\uff0c\u6613\u4e8e\u4f7f\u7528\u4e14\u529f\u80fd\u5f3a\u5927\u3002lxml\u5219\u4ee5\u5176\u5feb\u901f\u548c\u9ad8\u6548\u800c\u95fb\u540d\uff0c\u9002\u5408\u5904\u7406\u5927\u578b\u6587\u6863\u3002\u6b64\u5916\uff0chtml5lib\u53ef\u4ee5\u5904\u7406\u66f4\u590d\u6742\u7684HTML\u7ed3\u6784\uff0c\u9002\u5408\u9700\u8981\u517c\u5bb9\u6027\u548c\u7075\u6d3b\u6027\u7684\u9879\u76ee\u3002\u9009\u62e9\u5408\u9002\u7684\u5e93\u53d6\u51b3\u4e8e\u9879\u76ee\u7684\u9700\u6c42\u548c\u590d\u6742\u6027\u3002<\/p>\n<p><strong>\u5982\u4f55\u5904\u7406\u590d\u6742\u7684HTML\u7ed3\u6784\uff1f<\/strong><br \/>\u5904\u7406\u590d\u6742\u7684HTML\u7ed3\u6784\u65f6\uff0c\u53ef\u4ee5\u7ed3\u5408\u4f7f\u7528Beautiful Soup\u548c\u6b63\u5219\u8868\u8fbe\u5f0f\u3002Beautiful Soup\u53ef\u4ee5\u5e2e\u52a9\u60a8\u627e\u5230\u7279\u5b9a\u7684\u6807\u7b7e\uff0c\u800c\u6b63\u5219\u8868\u8fbe\u5f0f\u5219\u53ef\u4ee5\u5728\u63d0\u53d6\u7684\u6570\u636e\u4e2d\u67e5\u627e\u7279\u5b9a\u6a21\u5f0f\u3002\u8fd9\u79cd\u65b9\u6cd5\u53ef\u4ee5\u6709\u6548\u5730\u5904\u7406\u5e26\u6709\u5d4c\u5957\u7ed3\u6784\u7684HTML\uff0c\u6bd4\u5982\u8868\u683c\u6216\u5217\u8868\u3002\u793a\u4f8b\u4ee3\u7801\u5982\u4e0b\uff1a  <\/p>\n<pre><code class=\"language-python\">import re\n\n# \u5047\u8bbesoup\u5bf9\u8c61\u5df2\u521b\u5efa\ncontent = soup.find(&#39;div&#39;, 
class_=&#39;content&#39;)\nmatches = re.findall(r&#39;\\d+&#39;, content.text)  # \u627e\u5230\u6240\u6709\u6570\u5b57\nprint(matches)\n<\/code><\/pre>\n<p>\u8fd9\u79cd\u7ec4\u5408\u65b9\u6cd5\u80fd\u591f\u8ba9\u60a8\u66f4\u7075\u6d3b\u5730\u5904\u7406\u548c\u89e3\u6790\u590d\u6742\u7684HTML\u6570\u636e\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"Python\u5904\u7406HTML\u6570\u636e\u7684\u65b9\u6cd5\u6709\u5f88\u591a\u79cd\uff0c\u53ef\u4ee5\u4f7f\u7528BeautifulSoup\u8fdb\u884c\u89e3\u6790\u3001\u5229\u7528requests\u5e93 [&hellip;]","protected":false},"author":3,"featured_media":1162513,"comment_status":"closed","ping_status":"","sticky":false,"template":"","format":"standard","meta":{"_acf_changed":false,"footnotes":""},"categories":[37],"tags":[],"acf":[],"_links":{"self":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/1162508"}],"collection":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/users\/3"}],"replies":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/comments?post=1162508"}],"version-history":[{"count":"1","href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/1162508\/revisions"}],"predecessor-version":[{"id":1162517,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/1162508\/revisions\/1162517"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/media\/1162513"}],"wp:attachment":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/media?parent=1162508"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/categories?post=1162508"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/tags?post=1162508"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":
true}]}}