{"id":976390,"date":"2024-12-27T06:21:15","date_gmt":"2024-12-26T22:21:15","guid":{"rendered":"https:\/\/docs.pingcode.com\/ask\/ask-ask\/976390.html"},"modified":"2024-12-27T06:21:17","modified_gmt":"2024-12-26T22:21:17","slug":"python%e4%b8%ad%e5%a6%82%e4%bd%95%e8%b0%83%e5%8f%96pdf","status":"publish","type":"post","link":"https:\/\/docs.pingcode.com\/ask\/976390.html","title":{"rendered":"Python\u4e2d\u5982\u4f55\u8c03\u53d6pdf"},"content":{"rendered":"<p style=\"text-align:center;\" ><img decoding=\"async\" src=\"https:\/\/cdn-kb.worktile.com\/kb\/wp-content\/uploads\/2024\/04\/24201201\/46a49c3b-ebe0-499d-a9ff-5be0af745d65.webp\" alt=\"Python\u4e2d\u5982\u4f55\u8c03\u53d6pdf\" \/><\/p>\n<p><p> <strong>\u5728Python\u4e2d\u8c03\u53d6PDF\u53ef\u4ee5\u901a\u8fc7\u591a\u79cd\u5e93\u5b9e\u73b0\uff0c\u5982PyPDF2\u3001pdfplumber\u3001fitz\uff08PyMuPDF\uff09\u7b49\uff0c\u8fd9\u4e9b\u5e93\u5141\u8bb8\u7528\u6237\u6253\u5f00\u3001\u8bfb\u53d6\u548c\u64cd\u4f5cPDF\u6587\u4ef6\u3002PyPDF2\u9002\u5408\u7b80\u5355\u7684\u6587\u672c\u63d0\u53d6\u548c\u6587\u4ef6\u5408\u5e76\uff0cpdfplumber\u9002\u5408\u590d\u6742\u7684\u6587\u672c\u62bd\u53d6\u548c\u8868\u683c\u89e3\u6790\uff0cfitz\u5219\u63d0\u4f9b\u4e86\u66f4\u4e3a\u5f3a\u5927\u7684PDF\u9875\u9762\u56fe\u50cf\u548c\u6587\u672c\u5904\u7406\u529f\u80fd\u3002\u672c\u6587\u5c06\u8be6\u7ec6\u4ecb\u7ecd\u5982\u4f55\u4f7f\u7528\u8fd9\u4e9b\u5e93\u5b9e\u73b0PDF\u6587\u4ef6\u7684\u8c03\u53d6\u4e0e\u5904\u7406\u3002<\/strong><\/p>\n<\/p>\n<p><p>\u4e00\u3001PYPDF2\u5e93\u7684\u4f7f\u7528<\/p>\n<\/p>\n<p><p>PyPDF2\u662f\u4e00\u4e2a\u7eafPython\u7f16\u5199\u7684PDF\u5de5\u5177\u5e93\uff0c\u652f\u6301\u5bf9PDF\u6587\u4ef6\u8fdb\u884c\u9605\u8bfb\u3001\u5206\u5272\u3001\u5408\u5e76\u3001\u88c1\u526a\u7b49\u64cd\u4f5c\u3002\u867d\u7136\u4e0d\u652f\u6301\u590d\u6742\u7684PDF\u89e3\u6790\uff0c\u4f46\u5bf9\u4e8e\u7b80\u5355\u7684\u6587\u4ef6\u5904\u7406\u8db3\u591f\u80dc\u4efb\u3002<\/p>\n<\/p>\n<ol>\n<li><strong>\u5b89\u88c5\u4e0e\u57fa\u672c
\u4f7f\u7528<\/strong><\/li>\n<\/ol>\n<p><p>\u8981\u4f7f\u7528PyPDF2\uff0c\u9996\u5148\u9700\u8981\u5b89\u88c5\u8be5\u5e93\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-bash\">pip install PyPDF2<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u5b89\u88c5\u5b8c\u6210\u540e\uff0c\u53ef\u4ee5\u901a\u8fc7\u4ee5\u4e0b\u4ee3\u7801\u8bfb\u53d6PDF\u6587\u4ef6\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import PyPDF2<\/p>\n<h2><strong>\u6253\u5f00PDF\u6587\u4ef6<\/strong><\/h2>\n<p>with open(&#39;example.pdf&#39;, &#39;rb&#39;) as file:<\/p>\n<p>    reader = PyPDF2.PdfReader(file)<\/p>\n<p>    page = reader.pages[0]<\/p>\n<p>    print(page.extract_text())<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u6b64\u4ee3\u7801\u6253\u5f00\u4e00\u4e2a\u540d\u4e3aexample.pdf\u7684\u6587\u4ef6\uff0c\u5e76\u63d0\u53d6\u7b2c\u4e00\u9875\u7684\u6587\u672c\u5185\u5bb9\u3002<\/p>\n<\/p>\n<ol start=\"2\">\n<li><strong>PDF\u7684\u5408\u5e76\u4e0e\u5206\u5272<\/strong><\/li>\n<\/ol>\n<p><p>PyPDF2\u53ef\u4ee5\u5f88\u65b9\u4fbf\u5730\u5408\u5e76\u548c\u5206\u5272PDF\u6587\u4ef6\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from PyPDF2 import PdfMerger, PdfReader, PdfWriter<\/p>\n<h2><strong>\u5408\u5e76PDF\u6587\u4ef6<\/strong><\/h2>\n<p>merger = PdfMerger()<\/p>\n<p>files = [&#39;file1.pdf&#39;, &#39;file2.pdf&#39;]<\/p>\n<p>for f in files:<\/p>\n<p>    merger.append(f)<\/p>\n<p>merger.write(&#39;merged.pdf&#39;)<\/p>\n<p>merger.close()<\/p>\n<h2><strong>\u5206\u5272PDF\u6587\u4ef6<\/strong><\/h2>\n<p>reader = PdfReader(&#39;merged.pdf&#39;)<\/p>\n<p>writer = PdfWriter()<\/p>\n<p>for page in reader.pages[:2]:  # \u4ec5\u63d0\u53d6\u524d\u4e24\u9875<\/p>\n<p>    writer.add_page(page)<\/p>\n<p>with open(&#39;split.pdf&#39;, &#39;wb&#39;) as output_file:<\/p>\n<p>    
writer.write(output_file)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u4ee5\u4e0a\u4ee3\u7801\u5c55\u793a\u4e86\u5982\u4f55\u5408\u5e76\u591a\u4e2aPDF\u6587\u4ef6\uff0c\u5e76\u4ece\u5408\u5e76\u540e\u7684\u6587\u4ef6\u4e2d\u63d0\u53d6\u524d\u4e24\u9875\u751f\u6210\u4e00\u4e2a\u65b0\u6587\u4ef6\u3002<\/p>\n<\/p>\n<p><p>\u4e8c\u3001PDFPLUMBER\u5e93\u7684\u4f7f\u7528<\/p>\n<\/p>\n<p><p>pdfplumber\u662f\u4e00\u4e2a\u529f\u80fd\u5f3a\u5927\u7684PDF\u6587\u672c\u63d0\u53d6\u5de5\u5177\uff0c\u7279\u522b\u9002\u7528\u4e8e\u4ece\u590d\u6742\u5e03\u5c40\u7684PDF\u4e2d\u63d0\u53d6\u8868\u683c\u548c\u6587\u672c\u3002<\/p>\n<\/p>\n<ol>\n<li><strong>\u5b89\u88c5\u4e0e\u57fa\u672c\u4f7f\u7528<\/strong><\/li>\n<\/ol>\n<p><p>\u9996\u5148\u5b89\u88c5pdfplumber\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-bash\">pip install pdfplumber<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u4ee5\u4e0b\u662f\u4f7f\u7528pdfplumber\u63d0\u53d6\u6587\u672c\u548c\u8868\u683c\u7684\u4f8b\u5b50\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import pdfplumber<\/p>\n<h2><strong>\u6253\u5f00PDF\u6587\u4ef6<\/strong><\/h2>\n<p>with pdfplumber.open(&#39;example.pdf&#39;) as pdf:<\/p>\n<p>    page = pdf.pages[0]<\/p>\n<p>    text = page.extract_text()<\/p>\n<p>    print(text)<\/p>\n<p>    # \u63d0\u53d6\u8868\u683c<\/p>\n<p>    tables = page.extract_tables()<\/p>\n<p>    for table in tables:<\/p>\n<p>        for row in table:<\/p>\n<p>            print(row)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u8be5\u793a\u4f8b\u5c55\u793a\u4e86\u5982\u4f55\u4ecePDF\u4e2d\u63d0\u53d6\u6587\u5b57\u4ee5\u53ca\u89e3\u6790\u8868\u683c\u3002<\/p>\n<\/p>\n<ol start=\"2\">\n<li><strong>\u5904\u7406\u590d\u6742\u5e03\u5c40<\/strong><\/li>\n<\/ol>\n<p><p>pdfplumber\u53ef\u4ee5\u5904\u7406\u590d\u6742\u7684PDF\u5e03\u5c40\uff0c\u652f\u6301\u81ea\u5b9a\u4e49\u63d0\u53d6\u533a\u57df\u548c\u65cb\u8f6c\u6821\u6b63\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">with pdfplumber.open(&#39;example.pdf&#39;) as 
pdf:<\/p>\n<p>    page = pdf.pages[0]<\/p>\n<p>    # \u5b9a\u4e49\u533a\u57df\uff08x0, y0, x1, y1\uff09<\/p>\n<p>    bbox = (50, 50, 500, 500)<\/p>\n<p>    cropped_page = page.within_bbox(bbox)<\/p>\n<p>    # \u63d0\u53d6\u533a\u57df\u5185\u7684\u6587\u672c<\/p>\n<p>    text = cropped_page.extract_text()<\/p>\n<p>    print(text)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u901a\u8fc7\u5b9a\u4e49\u5750\u6807\u6846\uff0c\u7528\u6237\u53ef\u4ee5\u63d0\u53d6PDF\u4e2d\u7279\u5b9a\u533a\u57df\u7684\u5185\u5bb9\u3002<\/p>\n<\/p>\n<p><p>\u4e09\u3001FITZ\uff08PYMUPDF\uff09\u5e93\u7684\u4f7f\u7528<\/p>\n<\/p>\n<p><p>fitz\uff08\u4e5f\u79f0\u4e3aPyMuPDF\uff09\u662f\u4e00\u4e2a\u529f\u80fd\u5f3a\u5927\u7684PDF\u5904\u7406\u5e93\uff0c\u63d0\u4f9b\u4e86PDF\u9875\u9762\u7684\u56fe\u50cf\u548c\u6587\u672c\u5904\u7406\u529f\u80fd\u3002<\/p>\n<\/p>\n<ol>\n<li><strong>\u5b89\u88c5\u4e0e\u57fa\u672c\u4f7f\u7528<\/strong><\/li>\n<\/ol>\n<p><p>\u5b89\u88c5fitz\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-bash\">pip install pymupdf<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u4ee5\u4e0b\u793a\u4f8b\u6f14\u793a\u5982\u4f55\u4f7f\u7528fitz\u8bfb\u53d6PDF\u6587\u4ef6\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import fitz  # PyMuPDF<\/p>\n<h2><strong>\u6253\u5f00PDF\u6587\u4ef6<\/strong><\/h2>\n<p>doc = fitz.open(&#39;example.pdf&#39;)<\/p>\n<p>page = doc.load_page(0)  # \u7b2c0\u9875<\/p>\n<p>text = page.get_text()<\/p>\n<p>print(text)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<ol start=\"2\">\n<li><strong>\u63d0\u53d6\u56fe\u50cf\u548c\u9875\u9762\u5904\u7406<\/strong><\/li>\n<\/ol>\n<p><p>fitz\u4e0d\u4ec5\u53ef\u4ee5\u63d0\u53d6\u6587\u672c\uff0c\u8fd8\u53ef\u4ee5\u63d0\u53d6\u9875\u9762\u4e2d\u7684\u56fe\u50cf\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\"># \u63d0\u53d6\u56fe\u50cf<\/p>\n<p>for img_index, img in enumerate(page.get_images(full=True)):<\/p>\n<p>    xref = img[0]<\/p>\n<p>    base_image = doc.extract_image(xref)<\/p>\n<p>    image_bytes = 
base_image[&quot;image&quot;]<\/p>\n<p>    # \u4fdd\u5b58\u56fe\u50cf\u6587\u4ef6<\/p>\n<p>    with open(f&#39;image{img_index}.png&#39;, &#39;wb&#39;) as img_file:<\/p>\n<p>        img_file.write(image_bytes)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u6b64\u5916\uff0cfitz\u8fd8\u652f\u6301\u5bf9\u9875\u9762\u8fdb\u884c\u65cb\u8f6c\u3001\u7f29\u653e\u7b49\u64cd\u4f5c\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\"># \u65cb\u8f6c\u9875\u9762<\/p>\n<p>page.set_rotation(90)<\/p>\n<h2><strong>\u4fdd\u5b58\u4fee\u6539<\/strong><\/h2>\n<p>doc.save(&#39;rotated_example.pdf&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u56db\u3001\u603b\u7ed3<\/p>\n<\/p>\n<p><p><strong>\u5728Python\u4e2d\u8c03\u53d6\u548c\u5904\u7406PDF\u7684\u591a\u6837\u5316\u9700\u6c42\u53ef\u4ee5\u901a\u8fc7\u4e0d\u540c\u7684\u5e93\u6765\u6ee1\u8db3\u3002<\/strong>\u5bf9\u4e8e\u7b80\u5355\u7684PDF\u64cd\u4f5c\uff0c\u5982\u5408\u5e76\u548c\u5206\u5272\uff0cPyPDF2\u662f\u4e00\u4e2a\u4e0d\u9519\u7684\u9009\u62e9\u3002\u5bf9\u4e8e\u590d\u6742\u7684\u6587\u672c\u63d0\u53d6\u548c\u8868\u683c\u89e3\u6790\uff0cpdfplumber\u63d0\u4f9b\u4e86\u5f3a\u5927\u7684\u529f\u80fd\u652f\u6301\u3002\u800c\u5bf9\u4e8e\u9700\u8981\u5904\u7406PDF\u4e2d\u7684\u56fe\u50cf\u548c\u6267\u884c\u9875\u9762\u7ea7\u522b\u64cd\u4f5c\u7684\u60c5\u51b5\uff0cfitz\uff08PyMuPDF\uff09\u5219\u662f\u4e00\u4e2a\u7406\u60f3\u7684\u9009\u62e9\u3002\u6839\u636e\u5177\u4f53\u9700\u6c42\u9009\u62e9\u5408\u9002\u7684\u5de5\u5177\uff0c\u53ef\u4ee5\u5927\u5927\u63d0\u9ad8PDF\u5904\u7406\u7684\u6548\u7387\u548c\u6548\u679c\u3002<\/p>\n<\/p>\n<h2><strong>\u76f8\u5173\u95ee\u7b54FAQs\uff1a<\/strong><\/h2>\n<p> <strong>\u5982\u4f55\u4f7f\u7528Python\u8bfb\u53d6PDF\u6587\u4ef6\u7684\u5185\u5bb9\uff1f<\/strong><br 
\/>\u5728Python\u4e2d\uff0c\u53ef\u4ee5\u4f7f\u7528\u591a\u4e2a\u5e93\u6765\u8bfb\u53d6PDF\u6587\u4ef6\uff0c\u4f8b\u5982PyPDF2\u3001PDFMiner\u548cPyMuPDF\u7b49\u3002PyPDF2\u662f\u4e00\u4e2a\u975e\u5e38\u6d41\u884c\u7684\u9009\u62e9\uff0c\u53ef\u4ee5\u901a\u8fc7\u4ee5\u4e0b\u65b9\u5f0f\u8bfb\u53d6PDF\u5185\u5bb9\uff1a  <\/p>\n<pre><code class=\"language-python\">import PyPDF2\n\nwith open(&#39;example.pdf&#39;, &#39;rb&#39;) as file:\n    reader = PyPDF2.PdfReader(file)\n    for page in reader.pages:\n        print(page.extract_text())\n<\/code><\/pre>\n<p>\u901a\u8fc7\u8fd9\u79cd\u65b9\u5f0f\uff0c\u53ef\u4ee5\u9010\u9875\u63d0\u53d6\u6587\u672c\u5185\u5bb9\uff0c\u65b9\u4fbf\u8fdb\u884c\u540e\u7eed\u5206\u6790\u6216\u5904\u7406\u3002<\/p>\n<p><strong>Python\u4e2d\u6709\u54ea\u4e9b\u5e93\u9002\u5408\u5904\u7406PDF\u6587\u4ef6\uff1f<\/strong><br \/>\u5904\u7406PDF\u6587\u4ef6\u7684\u5e93\u6709\u5f88\u591a\uff0c\u4ee5\u4e0b\u662f\u4e00\u4e9b\u5e38\u7528\u7684\u5e93\uff1a  <\/p>\n<ol>\n<li><strong>PyPDF2<\/strong>\uff1a\u9002\u5408\u57fa\u672c\u7684PDF\u8bfb\u53d6\u548c\u5408\u5e76\u64cd\u4f5c\u3002  <\/li>\n<li><strong>PDFMiner<\/strong>\uff1a\u66f4\u9002\u5408\u4ecePDF\u4e2d\u63d0\u53d6\u590d\u6742\u5e03\u5c40\u7684\u6587\u672c\u548c\u56fe\u50cf\u3002  <\/li>\n<li><strong>PyMuPDF\uff08fitz\uff09<\/strong>\uff1a\u652f\u6301\u591a\u79cd\u683c\u5f0f\uff0c\u6027\u80fd\u4f18\u826f\uff0c\u9002\u5408\u56fe\u50cf\u5904\u7406\u548c\u6e32\u67d3\u3002  <\/li>\n<li><strong>reportlab<\/strong>\uff1a\u4e3b\u8981\u7528\u4e8e\u751f\u6210PDF\u6587\u4ef6\uff0c\u800c\u4e0d\u662f\u8bfb\u53d6\u3002<br \/>\u6839\u636e\u9700\u6c42\u9009\u62e9\u5408\u9002\u7684\u5e93\u53ef\u4ee5\u5927\u5927\u63d0\u9ad8\u5904\u7406\u6548\u7387\u3002<\/li>\n<\/ol>\n<p><strong>\u5982\u4f55\u5728Python\u4e2d\u5c06PDF\u6587\u4ef6\u8f6c\u6362\u4e3a\u5176\u4ed6\u683c\u5f0f\uff1f<\/strong><br 
\/>\u8f6c\u6362PDF\u6587\u4ef6\u4e3a\u5176\u4ed6\u683c\u5f0f\uff0c\u5982TXT\u6216\u56fe\u7247\uff0c\u53ef\u4ee5\u4f7f\u7528\u4e0d\u540c\u7684\u5e93\u3002\u4f8b\u5982\uff0c\u4f7f\u7528PDFMiner\u5c06PDF\u8f6c\u6362\u4e3a\u6587\u672c\uff0c\u6216\u4f7f\u7528pdf2image\u5e93\u5c06PDF\u9875\u9762\u8f6c\u4e3a\u56fe\u7247\u3002\u4ee3\u7801\u793a\u4f8b\uff1a  <\/p>\n<pre><code class=\"language-python\">from pdf2image import convert_from_path\n\nimages = convert_from_path(&#39;example.pdf&#39;)\nfor i, image in enumerate(images):\n    image.save(f&#39;page_{i + 1}.jpg&#39;, &#39;JPEG&#39;)\n<\/code><\/pre>\n<p>\u8fd9\u79cd\u65b9\u5f0f\u53ef\u4ee5\u5c06\u6bcf\u4e00\u9875PDF\u8f6c\u4e3a\u72ec\u7acb\u7684JPEG\u56fe\u7247\uff0c\u4fbf\u4e8e\u67e5\u770b\u548c\u5206\u4eab\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"\u5728Python\u4e2d\u8c03\u53d6PDF\u53ef\u4ee5\u901a\u8fc7\u591a\u79cd\u5e93\u5b9e\u73b0\uff0c\u5982PyPDF2\u3001pdfplumber\u3001fitz\uff08PyMuPDF\uff09 [&hellip;]","protected":false},"author":3,"featured_media":976395,"comment_status":"closed","ping_status":"","sticky":false,"template":"","format":"standard","meta":{"_acf_changed":false,"footnotes":""},"categories":[37],"tags":[],"acf":[],"_links":{"self":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/976390"}],"collection":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/users\/3"}],"replies":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/comments?post=976390"}],"version-history":[{"count":"1","href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/976390\/revisions"}],"predecessor-version":[{"id":976397,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/976390\/revisions\/976397"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/media\/9
76395"}],"wp:attachment":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/media?parent=976390"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/categories?post=976390"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/tags?post=976390"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}