{"id":1045626,"date":"2024-12-31T13:25:14","date_gmt":"2024-12-31T05:25:14","guid":{"rendered":"https:\/\/docs.pingcode.com\/ask\/ask-ask\/1045626.html"},"modified":"2024-12-31T13:25:17","modified_gmt":"2024-12-31T05:25:17","slug":"python%e5%a6%82%e4%bd%95%e8%bf%9b%e8%a1%8c%e5%af%b9%e4%b8%ad%e6%96%87%e7%9a%84%e7%ad%9b%e9%80%89","status":"publish","type":"post","link":"https:\/\/docs.pingcode.com\/ask\/1045626.html","title":{"rendered":"python\u5982\u4f55\u8fdb\u884c\u5bf9\u4e2d\u6587\u7684\u7b5b\u9009"},"content":{"rendered":"<p style=\"text-align:center;\" ><img decoding=\"async\" src=\"https:\/\/cdn-docs.pingcode.com\/wp-content\/uploads\/2024\/12\/96937028-c120-4e45-b899-ecbe2318e258.webp?x-oss-process=image\/auto-orient,1\/format,webp\" alt=\"python\u5982\u4f55\u8fdb\u884c\u5bf9\u4e2d\u6587\u7684\u7b5b\u9009\" \/><\/p>\n<p><p> \u5728Python\u4e2d\u8fdb\u884c\u5bf9\u4e2d\u6587\u7684\u7b5b\u9009\u6709\u591a\u79cd\u65b9\u6cd5\uff0c\u4ee5\u4e0b\u662f\u51e0\u79cd\u5e38\u89c1\u7684\u65b9\u5f0f\uff1a<strong>\u4f7f\u7528\u6b63\u5219\u8868\u8fbe\u5f0f\u3001\u5229\u7528Unicode\u8303\u56f4\u3001\u7ed3\u5408\u6587\u672c\u5904\u7406\u5e93\u3001\u5229\u7528NLP\u5de5\u5177<\/strong>\u3002\u5176\u4e2d\uff0c\u5229\u7528\u6b63\u5219\u8868\u8fbe\u5f0f\u662f\u4e00\u79cd\u76f4\u63a5\u4e14\u9ad8\u6548\u7684\u65b9\u6cd5\uff0c\u53ef\u4ee5\u901a\u8fc7\u5339\u914d\u4e2d\u6587\u5b57\u7b26\u7684Unicode\u8303\u56f4\u6765\u7b5b\u9009\u4e2d\u6587\u5b57\u7b26\u3002<\/p>\n<\/p>\n<p><h3>\u4e00\u3001\u4f7f\u7528\u6b63\u5219\u8868\u8fbe\u5f0f<\/h3>\n<\/p>\n<p><p>\u6b63\u5219\u8868\u8fbe\u5f0f\u662f\u4e00\u79cd\u5f3a\u5927\u7684\u6587\u672c\u5904\u7406\u5de5\u5177\u3002\u5728Python\u4e2d\uff0c\u53ef\u4ee5\u4f7f\u7528 <code>re<\/code> \u6a21\u5757\u6765\u8fdb\u884c\u6b63\u5219\u8868\u8fbe\u5f0f\u64cd\u4f5c\u3002\u4e2d\u6587\u5b57\u7b26\u7684Unicode\u8303\u56f4\u662f 
<code>\\u4e00-\\u9fa5<\/code>\uff08\u8be5\u8303\u56f4\u8986\u76d6\u5e38\u7528\u6c49\u5b57\uff0c\u5b8c\u6574\u7684\u57fa\u672c\u533a\u4e3a <code>\\u4e00-\\u9fff<\/code>\uff09\uff0c\u6211\u4eec\u53ef\u4ee5\u901a\u8fc7\u8fd9\u4e2a\u8303\u56f4\u6765\u7b5b\u9009\u51fa\u4e2d\u6587\u5b57\u7b26\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import re<\/p>\n<p>def filter_chinese(text):<\/p>\n<p>    pattern = re.compile(r&#39;[\\u4e00-\\u9fa5]+&#39;)<\/p>\n<p>    result = pattern.findall(text)<\/p>\n<p>    return &#39;&#39;.join(result)<\/p>\n<p>text = &quot;Hello, \u4f60\u597d, \u4e16\u754c!&quot;<\/p>\n<p>filtered_text = filter_chinese(text)<\/p>\n<p>print(filtered_text)  # \u8f93\u51fa: \u4f60\u597d\u4e16\u754c<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u901a\u8fc7\u4e0a\u8ff0\u65b9\u6cd5\uff0c\u6211\u4eec\u53ef\u4ee5\u8f7b\u677e\u5730\u4ece\u5b57\u7b26\u4e32\u4e2d\u7b5b\u9009\u51fa\u4e2d\u6587\u5b57\u7b26\uff0c\u5e76\u5c06\u5b83\u4eec\u62fc\u63a5\u6210\u4e00\u4e2a\u65b0\u7684\u5b57\u7b26\u4e32\u3002<\/p>\n<\/p>\n<p><h3>\u4e8c\u3001\u5229\u7528Unicode\u8303\u56f4<\/h3>\n<\/p>\n<p><p>\u9664\u4e86\u4f7f\u7528\u6b63\u5219\u8868\u8fbe\u5f0f\uff0c\u6211\u4eec\u8fd8\u53ef\u4ee5\u76f4\u63a5\u5229\u7528Unicode\u8303\u56f4\u6765\u7b5b\u9009\u4e2d\u6587\u5b57\u7b26\u3002\u901a\u8fc7\u5224\u65ad\u5b57\u7b26\u7684Unicode\u7f16\u7801\u662f\u5426\u5728\u7279\u5b9a\u8303\u56f4\u5185\uff0c\u53ef\u4ee5\u7b5b\u9009\u51fa\u4e2d\u6587\u5b57\u7b26\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">def is_chinese(char):<\/p>\n<p>    return &#39;\\u4e00&#39; &lt;= char &lt;= &#39;\\u9fa5&#39;<\/p>\n<p>def filter_chinese(text):<\/p>\n<p>    return &#39;&#39;.join([char for char in text if is_chinese(char)])<\/p>\n<p>text = &quot;Hello, \u4f60\u597d, \u4e16\u754c!&quot;<\/p>\n<p>filtered_text = filter_chinese(text)<\/p>\n<p>print(filtered_text)  # \u8f93\u51fa: 
\u4f60\u597d\u4e16\u754c<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u8fd9\u79cd\u65b9\u6cd5\u867d\u7136\u6ca1\u6709\u6b63\u5219\u8868\u8fbe\u5f0f\u90a3\u4e48\u7b80\u6d01\uff0c\u4f46\u4e5f\u975e\u5e38\u76f4\u89c2\uff0c\u53ef\u4ee5\u76f4\u63a5\u5224\u65ad\u6bcf\u4e2a\u5b57\u7b26\u662f\u5426\u4e3a\u4e2d\u6587\u3002<\/p>\n<\/p>\n<p><h3>\u4e09\u3001\u7ed3\u5408\u6587\u672c\u5904\u7406\u5e93<\/h3>\n<\/p>\n<p><p>Python\u6709\u8bb8\u591a\u5f3a\u5927\u7684\u6587\u672c\u5904\u7406\u5e93\uff0c\u5982 <code>jieba<\/code> \u548c <code>SnowNLP<\/code>\uff0c\u8fd9\u4e9b\u5e93\u4e0d\u4ec5\u53ef\u4ee5\u8fdb\u884c\u4e2d\u6587\u5206\u8bcd\uff0c\u8fd8\u53ef\u4ee5\u7528\u4e8e\u7b5b\u9009\u4e2d\u6587\u5b57\u7b26\u3002<\/p>\n<\/p>\n<p><h4>\u4f7f\u7528jieba\u5e93<\/h4>\n<\/p>\n<p><p><code>jieba<\/code> \u662f\u4e00\u4e2a\u975e\u5e38\u6d41\u884c\u7684\u4e2d\u6587\u5206\u8bcd\u5e93\uff0c\u53ef\u4ee5\u7528\u6765\u5206\u8bcd\u5e76\u7b5b\u9009\u51fa\u4e2d\u6587\u8bcd\u8bed\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import jieba<\/p>\n<p>def filter_chinese(text):<\/p>\n<p>    words = jieba.lcut(text)<\/p>\n<p>    return &#39;&#39;.join([word for word in words if all(&#39;\\u4e00&#39; &lt;= char &lt;= &#39;\\u9fa5&#39; for char in word)])<\/p>\n<p>text = &quot;Hello, \u4f60\u597d, \u4e16\u754c!&quot;<\/p>\n<p>filtered_text = filter_chinese(text)<\/p>\n<p>print(filtered_text)  # \u8f93\u51fa: \u4f60\u597d\u4e16\u754c<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>\u4f7f\u7528SnowNLP\u5e93<\/h4>\n<\/p>\n<p><p><code>SnowNLP<\/code> \u4e5f\u662f\u4e00\u4e2a\u7528\u4e8e\u5904\u7406\u4e2d\u6587\u6587\u672c\u7684\u5e93\uff0c\u53ef\u4ee5\u7528\u4e8e\u60c5\u611f\u5206\u6790\u3001\u6458\u8981\u63d0\u53d6\u7b49\u4efb\u52a1\u3002\u6211\u4eec\u4e5f\u53ef\u4ee5\u7528\u5b83\u6765\u7b5b\u9009\u4e2d\u6587\u5b57\u7b26\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from snownlp import SnowNLP<\/p>\n<p>def filter_chinese(text):<\/p>\n<p>    s = SnowNLP(text)<\/p>\n<p>    return 
&#39;&#39;.join([word for word in s.words if all(&#39;\\u4e00&#39; &lt;= char &lt;= &#39;\\u9fa5&#39; for char in word)])<\/p>\n<p>text = &quot;Hello, \u4f60\u597d, \u4e16\u754c!&quot;<\/p>\n<p>filtered_text = filter_chinese(text)<\/p>\n<p>print(filtered_text)  # \u8f93\u51fa: \u4f60\u597d\u4e16\u754c<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u56db\u3001\u5229\u7528NLP\u5de5\u5177<\/h3>\n<\/p>\n<p><p>\u81ea\u7136\u8bed\u8a00\u5904\u7406\uff08NLP\uff09\u5de5\u5177\u53ef\u4ee5\u63d0\u4f9b\u66f4\u9ad8\u7ea7\u7684\u6587\u672c\u5904\u7406\u529f\u80fd\u3002\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528NLP\u5de5\u5177\u6765\u8fdb\u884c\u4e2d\u6587\u7b5b\u9009\uff0c\u5e76\u7ed3\u5408\u5176\u4ed6\u5904\u7406\u529f\u80fd\u8fdb\u884c\u66f4\u590d\u6742\u7684\u6587\u672c\u5206\u6790\u3002<\/p>\n<\/p>\n<p><h4>\u4f7f\u7528spaCy\u5e93<\/h4>\n<\/p>\n<p><p><code>spaCy<\/code> \u662f\u4e00\u4e2a\u975e\u5e38\u5f3a\u5927\u7684NLP\u5e93\uff0c\u867d\u7136\u5b83\u4e3b\u8981\u7528\u4e8e\u5904\u7406\u82f1\u6587\u6587\u672c\uff0c\u4f46\u4e5f\u652f\u6301\u4e2d\u6587\u5904\u7406\u3002\u6211\u4eec\u53ef\u4ee5\u5229\u7528\u5b83\u7684\u5206\u8bcd\u529f\u80fd\u6765\u7b5b\u9009\u4e2d\u6587\u5b57\u7b26\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import spacy<\/p>\n<p># \u9700\u8981\u4e0b\u8f7d\u4e2d\u6587\u6a21\u578b<\/p>\n<p># pip install spacy<\/p>\n<p># python -m spacy download zh_core_web_sm<\/p>\n<p>nlp = spacy.load(&quot;zh_core_web_sm&quot;)<\/p>\n<p>def filter_chinese(text):<\/p>\n<p>    doc = nlp(text)<\/p>\n<p>    return &#39;&#39;.join([token.text for token in doc if all(&#39;\\u4e00&#39; &lt;= char &lt;= &#39;\\u9fa5&#39; for char in token.text)])<\/p>\n<p>text = &quot;Hello, \u4f60\u597d, \u4e16\u754c!&quot;<\/p>\n<p>filtered_text = filter_chinese(text)<\/p>\n<p>print(filtered_text)  # \u8f93\u51fa: 
\u4f60\u597d\u4e16\u754c<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u603b\u7ed3<\/h3>\n<\/p>\n<p><p>\u5728Python\u4e2d\u7b5b\u9009\u4e2d\u6587\u5b57\u7b26\u7684\u65b9\u6cd5\u6709\u5f88\u591a\u79cd\uff0c<strong>\u4f7f\u7528\u6b63\u5219\u8868\u8fbe\u5f0f\u3001\u5229\u7528Unicode\u8303\u56f4\u3001\u7ed3\u5408\u6587\u672c\u5904\u7406\u5e93\u3001\u5229\u7528NLP\u5de5\u5177<\/strong>\u662f\u51e0\u79cd\u5e38\u89c1\u4e14\u6709\u6548\u7684\u65b9\u6cd5\u3002\u6bcf\u79cd\u65b9\u6cd5\u90fd\u6709\u5176\u72ec\u7279\u7684\u4f18\u52bf\u548c\u9002\u7528\u573a\u666f\uff0c\u9009\u62e9\u5408\u9002\u7684\u65b9\u6cd5\u53ef\u4ee5\u6839\u636e\u5177\u4f53\u7684\u9700\u6c42\u548c\u9879\u76ee\u80cc\u666f\u6765\u51b3\u5b9a\u3002\u6b63\u5219\u8868\u8fbe\u5f0f\u548cUnicode\u8303\u56f4\u7684\u65b9\u6cd5\u9002\u7528\u4e8e\u7b80\u5355\u7684\u6587\u672c\u5904\u7406\u4efb\u52a1\uff0c\u800c\u6587\u672c\u5904\u7406\u5e93\u548cNLP\u5de5\u5177\u5219\u53ef\u4ee5\u7528\u4e8e\u66f4\u590d\u6742\u7684\u6587\u672c\u5206\u6790\u548c\u5904\u7406\u4efb\u52a1\u3002\u901a\u8fc7\u638c\u63e1\u8fd9\u4e9b\u65b9\u6cd5\uff0c\u6211\u4eec\u53ef\u4ee5\u8f7b\u677e\u5e94\u5bf9\u4e2d\u6587\u6587\u672c\u5904\u7406\u4e2d\u7684\u5404\u79cd\u6311\u6218\u3002<\/p>\n<\/p>\n<h2><strong>\u76f8\u5173\u95ee\u7b54FAQs\uff1a<\/strong><\/h2>\n<p> <strong>\u5982\u4f55\u4f7f\u7528Python\u5bf9\u4e2d\u6587\u6587\u672c\u8fdb\u884c\u7b5b\u9009\uff1f<\/strong><br \/>\u5728Python\u4e2d\uff0c\u53ef\u4ee5\u4f7f\u7528\u6b63\u5219\u8868\u8fbe\u5f0f\u5e93\uff08re\uff09\u6765\u7b5b\u9009\u4e2d\u6587\u5b57\u7b26\u3002\u901a\u8fc7\u5b9a\u4e49\u6b63\u5219\u8868\u8fbe\u5f0f\uff0c\u53ef\u4ee5\u63d0\u53d6\u51fa\u5305\u542b\u4e2d\u6587\u7684\u5b57\u7b26\u4e32\u3002\u4f8b\u5982\uff0c\u4f7f\u7528<code>re.findall(r&#39;[\\u4e00-\\u9fa5]+&#39;, 
text)<\/code>\u53ef\u4ee5\u627e\u5230\u6587\u672c\u4e2d\u6240\u6709\u7684\u4e2d\u6587\u5b57\u7b26\u3002<\/p>\n<p><strong>Python\u4e2d\u6709\u54ea\u4e9b\u5e93\u53ef\u4ee5\u5e2e\u52a9\u5904\u7406\u4e2d\u6587\u6587\u672c\uff1f<\/strong><br \/>\u5bf9\u4e8e\u4e2d\u6587\u6587\u672c\u7684\u5904\u7406\uff0c\u53ef\u4ee5\u8003\u8651\u4f7f\u7528\u4e00\u4e9b\u6d41\u884c\u7684\u5e93\uff0c\u5982jieba\u7528\u4e8e\u4e2d\u6587\u5206\u8bcd\uff0cPandas\u7528\u4e8e\u6570\u636e\u5904\u7406\uff0cBeautifulSoup\u7528\u4e8e\u89e3\u6790HTML\u7b49\u3002\u8fd9\u4e9b\u5e93\u80fd\u591f\u6709\u6548\u5730\u5e2e\u52a9\u4f60\u7b5b\u9009\u548c\u5904\u7406\u4e2d\u6587\u5185\u5bb9\uff0c\u63d0\u9ad8\u5de5\u4f5c\u6548\u7387\u3002<\/p>\n<p><strong>\u5982\u4f55\u786e\u4fdd\u7b5b\u9009\u51fa\u7684\u4e2d\u6587\u6587\u672c\u7684\u51c6\u786e\u6027\uff1f<\/strong><br \/>\u4e3a\u4e86\u4fdd\u8bc1\u7b5b\u9009\u51fa\u7684\u4e2d\u6587\u6587\u672c\u7684\u51c6\u786e\u6027\uff0c\u53ef\u4ee5\u7ed3\u5408\u4e0a\u4e0b\u6587\u4fe1\u606f\u8fdb\u884c\u5206\u6790\u3002\u4f7f\u7528\u81ea\u7136\u8bed\u8a00\u5904\u7406\uff08NLP\uff09\u6280\u672f\uff0c\u5982\u60c5\u611f\u5206\u6790\u548c\u4e3b\u9898\u5efa\u6a21\uff0c\u53ef\u4ee5\u5e2e\u52a9\u8fdb\u4e00\u6b65\u8fc7\u6ee4\u548c\u9a8c\u8bc1\u6587\u672c\u7684\u76f8\u5173\u6027\u548c\u51c6\u786e\u6027\u3002\u6b64\u5916\uff0c\u786e\u4fdd\u4f7f\u7528\u7684\u6b63\u5219\u8868\u8fbe\u5f0f\u6216\u5206\u8bcd\u65b9\u6cd5\u7b26\u5408\u5177\u4f53\u5e94\u7528\u573a\u666f\u7684\u9700\u6c42\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"\u5728Python\u4e2d\u8fdb\u884c\u5bf9\u4e2d\u6587\u7684\u7b5b\u9009\u6709\u591a\u79cd\u65b9\u6cd5\uff0c\u4ee5\u4e0b\u662f\u51e0\u79cd\u5e38\u89c1\u7684\u65b9\u5f0f\uff1a\u4f7f\u7528\u6b63\u5219\u8868\u8fbe\u5f0f\u3001\u5229\u7528Unicode\u8303\u56f4\u3001\u7ed3\u5408 
[&hellip;]","protected":false},"author":3,"featured_media":1045652,"comment_status":"closed","ping_status":"","sticky":false,"template":"","format":"standard","meta":{"_acf_changed":false,"footnotes":""},"categories":[37],"tags":[],"acf":[],"_links":{"self":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/1045626"}],"collection":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/users\/3"}],"replies":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/comments?post=1045626"}],"version-history":[{"count":"1","href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/1045626\/revisions"}],"predecessor-version":[{"id":1045654,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/1045626\/revisions\/1045654"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/media\/1045652"}],"wp:attachment":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/media?parent=1045626"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/categories?post=1045626"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/tags?post=1045626"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}