{"id":1025080,"date":"2024-12-30T14:19:29","date_gmt":"2024-12-30T06:19:29","guid":{"rendered":"https:\/\/docs.pingcode.com\/ask\/ask-ask\/1025080.html"},"modified":"2024-12-30T14:19:31","modified_gmt":"2024-12-30T06:19:31","slug":"%e5%a6%82%e4%bd%95%e7%94%a8python%e6%8a%93%e5%8f%96%e5%9b%be%e7%89%87-2","status":"publish","type":"post","link":"https:\/\/docs.pingcode.com\/ask\/1025080.html","title":{"rendered":"\u5982\u4f55\u7528python\u6293\u53d6\u56fe\u7247"},"content":{"rendered":"<p style=\"text-align:center;\" ><img decoding=\"async\" src=\"https:\/\/cdn-docs.pingcode.com\/wp-content\/uploads\/2024\/12\/32013d76-5a5b-462b-a8a5-9dc6ed3cb48d.webp?x-oss-process=image\/auto-orient,1\/format,webp\" alt=\"\u5982\u4f55\u7528python\u6293\u53d6\u56fe\u7247\" \/><\/p>\n<p><p> <strong>\u4f7f\u7528Python\u6293\u53d6\u56fe\u7247\u7684\u65b9\u6cd5\u6709\u5f88\u591a\u79cd\uff0c\u5305\u62ec\u4f7f\u7528\u5e93\u5982requests\u3001BeautifulSoup\u548cScrapy\u7b49\u3002\u5e38\u7528\u7684\u65b9\u6cd5\u6709\uff1arequests\u5e93\u53d1\u9001HTTP\u8bf7\u6c42\u3001BeautifulSoup\u89e3\u6790HTML\u3001Scrapy\u8fdb\u884c\u5927\u89c4\u6a21\u6293\u53d6\u3002\u4e0b\u9762\u5c06\u8be6\u7ec6\u4ecb\u7ecd\u4f7f\u7528requests\u548cBeautifulSoup\u6293\u53d6\u56fe\u7247\u7684\u65b9\u6cd5\u3002<\/strong><\/p>\n<\/p>\n<p><h3>\u4e00\u3001\u4f7f\u7528requests\u5e93\u6293\u53d6\u56fe\u7247<\/h3>\n<\/p>\n<p><p>requests\u5e93\u662fPython\u4e2d\u7528\u4e8e\u53d1\u9001HTTP\u8bf7\u6c42\u7684\u5e93\u3002\u5b83\u7b80\u5355\u6613\u7528\uff0c\u53ef\u4ee5\u65b9\u4fbf\u5730\u6293\u53d6\u7f51\u9875\u4e2d\u7684\u56fe\u7247\u3002<\/p>\n<\/p>\n<p><h4>1. \u5b89\u88c5requests\u5e93<\/h4>\n<\/p>\n<p><p>\u9996\u5148\u9700\u8981\u5b89\u88c5requests\u5e93\uff0c\u53ef\u4ee5\u4f7f\u7528\u4ee5\u4e0b\u547d\u4ee4\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-bash\">pip install requests<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>2. 
\u53d1\u9001HTTP\u8bf7\u6c42<\/h4>\n<\/p>\n<p><p>\u4f7f\u7528requests\u5e93\u53d1\u9001HTTP\u8bf7\u6c42\u83b7\u53d6\u7f51\u9875\u5185\u5bb9\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import requests<\/p>\n<p>url = &#39;https:\/\/example.com&#39;<\/p>\n<p>response = requests.get(url)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>3. \u89e3\u6790\u7f51\u9875\u5185\u5bb9<\/h4>\n<\/p>\n<p><p>\u83b7\u53d6\u7f51\u9875\u5185\u5bb9\u540e\uff0c\u9700\u8981\u89e3\u6790HTML\uff0c\u627e\u5230\u56fe\u7247\u7684URL\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from bs4 import BeautifulSoup<\/p>\n<p>soup = BeautifulSoup(response.text, &#39;html.parser&#39;)<\/p>\n<p>images = soup.find_all(&#39;img&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>4. \u4e0b\u8f7d\u56fe\u7247<\/h4>\n<\/p>\n<p><p>\u904d\u5386\u6240\u6709\u7684img\u6807\u7b7e\uff0c\u83b7\u53d6\u56fe\u7247\u7684src\u5c5e\u6027\u5e76\u4e0b\u8f7d\u56fe\u7247\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import os<\/p>\n<p>if not os.path.exists(&#39;images&#39;):<\/p>\n<p>    os.makedirs(&#39;images&#39;)<\/p>\n<p>for img in images:<\/p>\n<p>    img_url = img[&#39;src&#39;]<\/p>\n<p>    img_response = requests.get(img_url)<\/p>\n<p>    img_name = os.path.join(&#39;images&#39;, os.path.basename(img_url))<\/p>\n<p>    with open(img_name, &#39;wb&#39;) as f:<\/p>\n<p>        f.write(img_response.content)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e8c\u3001\u4f7f\u7528BeautifulSoup\u89e3\u6790HTML<\/h3>\n<\/p>\n<p><p>BeautifulSoup\u662f\u4e00\u4e2a\u7528\u4e8e\u89e3\u6790HTML\u548cXML\u6587\u6863\u7684\u5e93\u3002\u5b83\u53ef\u4ee5\u65b9\u4fbf\u5730\u63d0\u53d6\u6587\u6863\u4e2d\u7684\u6570\u636e\u3002<\/p>\n<\/p>\n<p><h4>1. \u5b89\u88c5BeautifulSoup\u5e93<\/h4>\n<\/p>\n<p><p>\u53ef\u4ee5\u4f7f\u7528\u4ee5\u4e0b\u547d\u4ee4\u5b89\u88c5BeautifulSoup\u5e93\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-bash\">pip install beautifulsoup4<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>2. 
\u89e3\u6790HTML\u6587\u6863<\/h4>\n<\/p>\n<p><p>\u4f7f\u7528BeautifulSoup\u89e3\u6790HTML\u6587\u6863\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from bs4 import BeautifulSoup<\/p>\n<p>soup = BeautifulSoup(response.text, &#39;html.parser&#39;)<\/p>\n<p>images = soup.find_all(&#39;img&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>3. \u83b7\u53d6\u56fe\u7247URL\u5e76\u4e0b\u8f7d<\/h4>\n<\/p>\n<p><p>\u627e\u5230\u6240\u6709\u7684img\u6807\u7b7e\u540e\uff0c\u83b7\u53d6\u6bcf\u4e2aimg\u6807\u7b7e\u7684src\u5c5e\u6027\uff0c\u5e76\u4e0b\u8f7d\u56fe\u7247\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import os<\/p>\n<p>if not os.path.exists(&#39;images&#39;):<\/p>\n<p>    os.makedirs(&#39;images&#39;)<\/p>\n<p>for img in images:<\/p>\n<p>    img_url = img[&#39;src&#39;]<\/p>\n<p>    if not img_url.startswith(&#39;http&#39;):<\/p>\n<p>        img_url = url + img_url<\/p>\n<p>    img_response = requests.get(img_url)<\/p>\n<p>    img_name = os.path.join(&#39;images&#39;, os.path.basename(img_url))<\/p>\n<p>    with open(img_name, &#39;wb&#39;) as f:<\/p>\n<p>        f.write(img_response.content)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e09\u3001\u4f7f\u7528Scrapy\u8fdb\u884c\u5927\u89c4\u6a21\u6293\u53d6<\/h3>\n<\/p>\n<p><p>Scrapy\u662f\u4e00\u4e2a\u5f3a\u5927\u7684Python\u7f51\u7edc\u6293\u53d6\u6846\u67b6\uff0c\u9002\u7528\u4e8e\u5927\u89c4\u6a21\u6293\u53d6\u4efb\u52a1\u3002<\/p>\n<\/p>\n<p><h4>1. \u5b89\u88c5Scrapy\u5e93<\/h4>\n<\/p>\n<p><p>\u53ef\u4ee5\u4f7f\u7528\u4ee5\u4e0b\u547d\u4ee4\u5b89\u88c5Scrapy\u5e93\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-bash\">pip install scrapy<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>2. \u521b\u5efaScrapy\u9879\u76ee<\/h4>\n<\/p>\n<p><p>\u4f7f\u7528\u4ee5\u4e0b\u547d\u4ee4\u521b\u5efaScrapy\u9879\u76ee\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-bash\">scrapy startproject image_scraper<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>3. 
\u5b9a\u4e49\u722c\u866b<\/h4>\n<\/p>\n<p><p>\u5728\u9879\u76ee\u76ee\u5f55\u4e0b\u521b\u5efa\u4e00\u4e2a\u722c\u866b\uff0c\u7f16\u8f91<code>spiders<\/code>\u76ee\u5f55\u4e0b\u7684\u6587\u4ef6\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import scrapy<\/p>\n<p>class ImageSpider(scrapy.Spider):<\/p>\n<p>    name = &#39;image_spider&#39;<\/p>\n<p>    start_urls = [&#39;https:\/\/example.com&#39;]<\/p>\n<p>    def parse(self, response):<\/p>\n<p>        for img in response.css(&#39;img&#39;):<\/p>\n<p>            img_url = img.attrib[&#39;src&#39;]<\/p>\n<p>            yield {&#39;image_url&#39;: img_url}<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>4. \u4fdd\u5b58\u56fe\u7247<\/h4>\n<\/p>\n<p><p>\u5728\u722c\u866b\u4e2d\u5b9a\u4e49\u4e00\u4e2a\u65b9\u6cd5\u6765\u4fdd\u5b58\u56fe\u7247\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import scrapy<\/p>\n<p>import os<\/p>\n<p>class ImageSpider(scrapy.Spider):<\/p>\n<p>    name = &#39;image_spider&#39;<\/p>\n<p>    start_urls = [&#39;https:\/\/example.com&#39;]<\/p>\n<p>    def parse(self, response):<\/p>\n<p>        if not os.path.exists(&#39;images&#39;):<\/p>\n<p>            os.makedirs(&#39;images&#39;)<\/p>\n<p>        for img in response.css(&#39;img&#39;):<\/p>\n<p>            img_url = img.attrib[&#39;src&#39;]<\/p>\n<p>            if not img_url.startswith(&#39;http&#39;):<\/p>\n<p>                img_url = response.urljoin(img_url)<\/p>\n<p>            img_name = os.path.join(&#39;images&#39;, os.path.basename(img_url))<\/p>\n<p>            yield scrapy.Request(img_url, callback=self.save_image, meta={&#39;img_name&#39;: img_name})<\/p>\n<p>    def save_image(self, response):<\/p>\n<p>        img_name = response.meta[&#39;img_name&#39;]<\/p>\n<p>        with open(img_name, &#39;wb&#39;) as f:<\/p>\n<p>            f.write(response.body)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>5. 
\u8fd0\u884c\u722c\u866b<\/h4>\n<\/p>\n<p><p>\u4f7f\u7528\u4ee5\u4e0b\u547d\u4ee4\u8fd0\u884c\u722c\u866b\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-bash\">scrapy crawl image_spider<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u56db\u3001\u6ce8\u610f\u4e8b\u9879<\/h3>\n<\/p>\n<p><h4>1. \u907f\u514d\u8fdd\u53cd\u7f51\u7ad9\u7684robots.txt<\/h4>\n<\/p>\n<p><p>\u5728\u6293\u53d6\u56fe\u7247\u65f6\uff0c\u8981\u6ce8\u610f\u907f\u514d\u8fdd\u53cd\u7f51\u7ad9\u7684robots.txt\u89c4\u5219\u3002\u53ef\u4ee5\u4f7f\u7528robots.txt\u6587\u4ef6\u6765\u4e86\u89e3\u7f51\u7ad9\u662f\u5426\u5141\u8bb8\u6293\u53d6\u56fe\u7247\u3002<\/p>\n<\/p>\n<p><h4>2. \u8bbe\u7f6e\u5408\u7406\u7684\u6293\u53d6\u9891\u7387<\/h4>\n<\/p>\n<p><p>\u907f\u514d\u5bf9\u7f51\u7ad9\u9020\u6210\u8fc7\u5927\u7684\u8d1f\u8f7d\uff0c\u53ef\u4ee5\u8bbe\u7f6e\u5408\u7406\u7684\u6293\u53d6\u9891\u7387\u3002\u4f7f\u7528time.sleep()\u51fd\u6570\u53ef\u4ee5\u5728\u6bcf\u6b21\u8bf7\u6c42\u4e4b\u95f4\u8bbe\u7f6e\u7b49\u5f85\u65f6\u95f4\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import time<\/p>\n<p>time.sleep(2)  # \u7b49\u5f852\u79d2<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>3. 
\u5904\u7406\u5f02\u5e38\u60c5\u51b5<\/h4>\n<\/p>\n<p><p>\u5728\u6293\u53d6\u56fe\u7247\u65f6\uff0c\u53ef\u80fd\u4f1a\u9047\u5230\u5404\u79cd\u5f02\u5e38\u60c5\u51b5\uff0c\u5982\u7f51\u7edc\u9519\u8bef\u3001\u56fe\u7247\u4e0d\u5b58\u5728\u7b49\u3002\u53ef\u4ee5\u4f7f\u7528try-except\u8bed\u53e5\u6765\u5904\u7406\u8fd9\u4e9b\u5f02\u5e38\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">try:<\/p>\n<p>    img_response = requests.get(img_url)<\/p>\n<p>    img_response.raise_for_status()  # \u68c0\u67e5\u662f\u5426\u6709\u8bf7\u6c42\u9519\u8bef<\/p>\n<p>except requests.exceptions.RequestException as e:<\/p>\n<p>    print(f&quot;Error downloading {img_url}: {e}&quot;)<\/p>\n<p>    continue<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e94\u3001\u603b\u7ed3<\/h3>\n<\/p>\n<p><p>\u4f7f\u7528Python\u6293\u53d6\u56fe\u7247\u7684\u65b9\u6cd5\u6709\u5f88\u591a\u79cd\uff0c\u5305\u62ec\u4f7f\u7528requests\u5e93\u53d1\u9001HTTP\u8bf7\u6c42\u3001\u4f7f\u7528BeautifulSoup\u89e3\u6790HTML\u3001\u4f7f\u7528Scrapy\u8fdb\u884c\u5927\u89c4\u6a21\u6293\u53d6\u7b49\u3002\u901a\u8fc7\u5408\u7406\u9009\u62e9\u65b9\u6cd5\u5e76\u6ce8\u610f\u4e00\u4e9b\u6293\u53d6\u65f6\u7684\u6ce8\u610f\u4e8b\u9879\uff0c\u53ef\u4ee5\u9ad8\u6548\u5730\u6293\u53d6\u7f51\u9875\u4e2d\u7684\u56fe\u7247\u3002\u5728\u5b9e\u9645\u5e94\u7528\u4e2d\uff0c\u53ef\u4ee5\u6839\u636e\u5177\u4f53\u9700\u6c42\u9009\u62e9\u5408\u9002\u7684\u65b9\u6cd5\u8fdb\u884c\u56fe\u7247\u6293\u53d6\u3002<\/p>\n<\/p>\n<h2><strong>\u76f8\u5173\u95ee\u7b54FAQs\uff1a<\/strong><\/h2>\n<p> <strong>\u5982\u4f55\u9009\u62e9\u5408\u9002\u7684\u5e93\u6765\u6293\u53d6\u56fe\u7247\uff1f<\/strong><br 
\/>\u5728\u4f7f\u7528Python\u6293\u53d6\u56fe\u7247\u65f6\uff0c\u9009\u62e9\u5408\u9002\u7684\u5e93\u81f3\u5173\u91cd\u8981\u3002\u5e38\u7528\u7684\u5e93\u5305\u62ecRequests\u548cBeautifulSoup\u3002Requests\u7528\u4e8e\u53d1\u9001HTTP\u8bf7\u6c42\uff0c\u83b7\u53d6\u7f51\u9875\u5185\u5bb9\uff1b\u800cBeautifulSoup\u5219\u7528\u6765\u89e3\u6790HTML\u548cXML\u6587\u6863\uff0c\u65b9\u4fbf\u63d0\u53d6\u56fe\u7247\u7684URL\u3002\u6b64\u5916\uff0cScrapy\u662f\u4e00\u4e2a\u529f\u80fd\u5f3a\u5927\u7684\u6846\u67b6\uff0c\u9002\u5408\u8fdb\u884c\u5927\u89c4\u6a21\u7684\u7f51\u9875\u6293\u53d6\uff0c\u80fd\u591f\u6709\u6548\u7ba1\u7406\u8bf7\u6c42\u548c\u6570\u636e\u5b58\u50a8\u3002<\/p>\n<p><strong>\u5728\u6293\u53d6\u56fe\u7247\u65f6\uff0c\u6709\u54ea\u4e9b\u9700\u8981\u6ce8\u610f\u7684\u6cd5\u5f8b\u95ee\u9898\uff1f<\/strong><br \/>\u6293\u53d6\u56fe\u7247\u65f6\uff0c\u9700\u4e86\u89e3\u76f8\u5173\u7684\u6cd5\u5f8b\u95ee\u9898\u3002\u4f8b\u5982\uff0c\u67d0\u4e9b\u56fe\u7247\u53d7\u5230\u7248\u6743\u4fdd\u62a4\uff0c\u672a\u7ecf\u6388\u6743\u4f7f\u7528\u53ef\u80fd\u5f15\u53d1\u6cd5\u5f8b\u7ea0\u7eb7\u3002\u5efa\u8bae\u5728\u6293\u53d6\u524d\u4ed4\u7ec6\u9605\u8bfb\u76ee\u6807\u7f51\u7ad9\u7684\u4f7f\u7528\u6761\u6b3e\uff0c\u786e\u4fdd\u9075\u5faa\u7f51\u7ad9\u7684robots.txt\u6587\u4ef6\u4e2d\u7684\u6293\u53d6\u89c4\u5219\u3002\u6b64\u5916\uff0c\u4f7f\u7528\u516c\u5171\u9886\u57df\u6216\u5f00\u6e90\u7684\u56fe\u7247\u8d44\u6e90\u4e5f\u662f\u4e00\u4e2a\u5b89\u5168\u7684\u9009\u62e9\u3002<\/p>\n<p><strong>\u5982\u4f55\u786e\u4fdd\u6293\u53d6\u7684\u56fe\u7247\u8d28\u91cf\u548c\u683c\u5f0f\uff1f<\/strong><br 
\/>\u6293\u53d6\u56fe\u7247\u65f6\uff0c\u786e\u4fdd\u6240\u6293\u53d6\u7684\u56fe\u7247\u8d28\u91cf\u548c\u683c\u5f0f\u53ef\u4ee5\u901a\u8fc7\u68c0\u67e5HTTP\u54cd\u5e94\u5934\u4e2d\u7684Content-Type\u6765\u5b9e\u73b0\u3002\u901a\u5e38\uff0cJPEG\u548cPNG\u662f\u5e38\u89c1\u7684\u56fe\u7247\u683c\u5f0f\u3002\u5728\u6293\u53d6\u8fc7\u7a0b\u4e2d\uff0c\u53ef\u4ee5\u6839\u636e\u9700\u8981\u9009\u62e9\u7279\u5b9a\u7684\u683c\u5f0f\u8fdb\u884c\u4e0b\u8f7d\u3002\u6b64\u5916\uff0c\u5efa\u8bae\u5728\u4fdd\u5b58\u56fe\u7247\u65f6\u4f7f\u7528\u9002\u5f53\u7684\u6587\u4ef6\u540d\u548c\u8def\u5f84\uff0c\u4ee5\u4fbf\u540e\u7eed\u7ba1\u7406\u548c\u8bbf\u95ee\u3002\u540c\u65f6\uff0c\u8003\u8651\u4f7f\u7528\u56fe\u50cf\u5904\u7406\u5e93\uff08\u5982PIL\u6216OpenCV\uff09\u6765\u8c03\u6574\u56fe\u7247\u7684\u5927\u5c0f\u548c\u8d28\u91cf\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"\u4f7f\u7528Python\u6293\u53d6\u56fe\u7247\u7684\u65b9\u6cd5\u6709\u5f88\u591a\u79cd\uff0c\u5305\u62ec\u4f7f\u7528\u5e93\u5982requests\u3001BeautifulSoup\u548cScrapy 
[&hellip;]","protected":false},"author":3,"featured_media":1025091,"comment_status":"closed","ping_status":"","sticky":false,"template":"","format":"standard","meta":{"_acf_changed":false,"footnotes":""},"categories":[37],"tags":[],"acf":[],"_links":{"self":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/1025080"}],"collection":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/users\/3"}],"replies":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/comments?post=1025080"}],"version-history":[{"count":"1","href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/1025080\/revisions"}],"predecessor-version":[{"id":1025094,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/1025080\/revisions\/1025094"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/media\/1025091"}],"wp:attachment":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/media?parent=1025080"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/categories?post=1025080"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/tags?post=1025080"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}