{"id":267920,"date":"2024-05-15T17:41:31","date_gmt":"2024-05-15T09:41:31","guid":{"rendered":"https:\/\/docs.pingcode.com\/ask\/ask-ask\/267920.html"},"modified":"2024-05-15T17:41:39","modified_gmt":"2024-05-15T09:41:39","slug":"%e6%9c%89c%e7%bc%96%e5%86%99%e7%9a%84%e7%bd%91%e7%bb%9c%e7%88%ac%e8%99%ab%e7%9a%84%e7%a4%ba%e4%be%8b%e4%bb%a3%e7%a0%81%e5%90%97","status":"publish","type":"post","link":"https:\/\/docs.pingcode.com\/ask\/ask-ask\/267920.html","title":{"rendered":"\u6709C#\u7f16\u5199\u7684\u7f51\u7edc\u722c\u866b\u7684\u793a\u4f8b\u4ee3\u7801\u5417"},"content":{"rendered":"<p style=\"text-align:center\"><img decoding=\"async\" src=\"https:\/\/cdn-kb.worktile.com\/kb\/wp-content\/uploads\/2024\/04\/27133112\/73b276d2-c9f6-4f4f-b39b-cf5d90f6bb11.webp\" alt=\"\u6709C#\u7f16\u5199\u7684\u7f51\u7edc\u722c\u866b\u7684\u793a\u4f8b\u4ee3\u7801\u5417\" \/><\/p>\n<p><p>C#\u7f16\u5199\u7684\u7f51\u7edc\u722c\u866b\u793a\u4f8b\u4ee3\u7801\u4e3b\u8981\u4f9d\u8d56\u4e8e<strong>HttpClient<\/strong>\u3001<strong>HtmlAgilityPack<\/strong>\u3001<strong>\u6b63\u5219\u8868\u8fbe\u5f0f<\/strong>\u548c<strong>\u591a\u7ebf\u7a0b<\/strong>\u6280\u672f\u3002\u4e0b\u9762\u662f\u4e00\u4e2a\u7b80\u5355\u7684\u7f51\u7edc\u722c\u866b\u793a\u4f8b\uff1a\u9996\u5148\uff0c\u4f7f\u7528<strong>HttpClient<\/strong>\u53d1\u9001\u8bf7\u6c42\u83b7\u53d6\u7f51\u9875\u5185\u5bb9\uff1b\u7136\u540e\uff0c\u4f7f\u7528<strong>HtmlAgilityPack<\/strong>\u89e3\u6790HTML\u6587\u6863\uff0c\u62bd\u53d6\u6240\u9700\u6570\u636e\uff1b\u63a5\u4e0b\u6765\uff0c\u5229\u7528<strong>\u6b63\u5219\u8868\u8fbe\u5f0f<\/strong>\u8fdb\u4e00\u6b65\u7b5b\u9009\u4fe1\u606f\uff1b\u6700\u540e\uff0c\u53ef\u4ee5\u8fd0\u7528<strong>\u591a\u7ebf\u7a0b<\/strong>\u6280\u672f\u63d0\u5347\u722c\u866b\u6548\u7387\u4e0e\u6027\u80fd\u3002\u8fd9\u6837\u7684\u722c\u866b\u80fd\u591f\u6709\u6548\u5730\u4ece\u7f51\u9875\u4e2d\u63d0\u53d6\u6570\u636e\uff0c\u5e76\u4e3a\u5404\u79cd\u7528\u9014\u5982\u6570\u636e\u5206\u6790\u3001\u4fe1\u606f\u805a\
u5408\u7b49\u63d0\u4f9b\u539f\u6750\u6599\u3002<\/p>\n<\/p>\n<p><p>\u4e00\u3001HTTPCLIENT\u7684\u4f7f\u7528<\/p>\n<\/p>\n<p><p>\u9996\u5148\uff0c\u8ba9\u6211\u4eec\u4e86\u89e3\u4e00\u4e0b<strong>HttpClient<\/strong>\u7c7b\u7684\u4f7f\u7528\u3002\u5728.NET\u4e2d\uff0c<strong>HttpClient<\/strong>\u662f\u4e00\u4e2a\u975e\u5e38\u5f3a\u5927\u7684\u5e93\uff0c\u7528\u4e8e\u53d1\u9001HTTP\u8bf7\u6c42\u548c\u63a5\u6536\u54cd\u5e94\u3002\u5b83\u662f\u6784\u5efa\u7f51\u7edc\u722c\u866b\u7684\u57fa\u7840\u5de5\u5177\u3002\u521b\u5efa\u4e00\u4e2aHttpClient\u5b9e\u4f8b\uff0c\u914d\u7f6e\u8bf7\u6c42\u5934\uff0c\u4f8b\u5982User-Agent\u6765\u6a21\u62df\u6d4f\u89c8\u5668\u884c\u4e3a\uff0c\u5e76\u53d1\u9001GET\u6216\u8005POST\u8bf7\u6c42\u81f3\u76ee\u6807URL\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-csharp\">using System.Net.Http;<\/p>\n<p>using System.Threading.Tasks;<\/p>\n<p>public class WebCrawler<\/p>\n<p>{<\/p>\n<p>    private readonly HttpClient _client;<\/p>\n<p>    public WebCrawler()<\/p>\n<p>    {<\/p>\n<p>        _client = new HttpClient();<\/p>\n<p>        _client.DefaultRequestHeaders.Add(&quot;User-Agent&quot;, &quot;Mozilla\/5.0 (compatible; CrawlerBot\/1.0)&quot;);<\/p>\n<p>    }<\/p>\n<p>    public async Task&lt;string&gt; FetchPageAsync(string url)<\/p>\n<p>    {<\/p>\n<p>        HttpResponseMessage response = await _client.GetAsync(url);<\/p>\n<p>        response.EnsureSuccessStatusCode();<\/p>\n<p>        string content = await response.Content.ReadAsStringAsync();<\/p>\n<p>        return content;<\/p>\n<p>    
}<\/p>\n<p>}<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u4e8c\u3001HTMLAGILITYPACK\u7684\u5e94\u7528<\/p>\n<\/p>\n<p><p>\u4e0b\u4e00\u6b65\u662f\u89e3\u6790HTML\u6587\u6863\u3002<strong>HtmlAgilityPack<\/strong>\u662f\u4e00\u4e2a\u5f3a\u5927\u7684.NET\u5e93\uff0c\u7528\u4e8e\u89e3\u6790\u548c\u64cd\u4f5cHTML\u6587\u6863\u3002\u5b83\u63d0\u4f9b\u4e86XPath\u548cCss\u9009\u62e9\u5668\u652f\u6301\uff0c\u4f7f\u5f97\u4ece\u590d\u6742\u7684HTML\u7ed3\u6784\u4e2d\u63d0\u53d6\u6570\u636e\u53d8\u5f97\u5bb9\u6613\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-csharp\">using HtmlAgilityPack;<\/p>\n<p>using System;<\/p>\n<p>using System.Linq;<\/p>\n<p>using System.Net.Http;<\/p>\n<p>using System.Threading.Tasks;<\/p>\n<p>public class HtmlParser<\/p>\n<p>{<\/p>\n<p>    private WebCrawler _crawler;<\/p>\n<p>    public HtmlParser(WebCrawler crawler)<\/p>\n<p>    {<\/p>\n<p>        _crawler = crawler;<\/p>\n<p>    }<\/p>\n<p>    public async Task ParseAsync(string url)<\/p>\n<p>    {<\/p>\n<p>        string pageContent = await _crawler.FetchPageAsync(url);<\/p>\n<p>        var htmlDoc = new HtmlDocument();<\/p>\n<p>        htmlDoc.LoadHtml(pageContent);<\/p>\n<p>        var nodes = htmlDoc.DocumentNode.SelectNodes(&quot;\/\/a[@href]&quot;);<\/p>\n<p>        foreach (var node in nodes)<\/p>\n<p>        {<\/p>\n<p>            string hrefValue = node.GetAttributeValue(&quot;href&quot;, string.Empty);<\/p>\n<p>            Console.WriteLine($&quot;Found link: {hrefValue}&quot;);<\/p>\n<p>            \/\/ \u5904\u7406\u627e\u5230\u7684\u94fe\u63a5...<\/p>\n<p>        }<\/p>\n<p>    
}<\/p>\n<p>}<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u4e09\u3001\u6b63\u5219\u8868\u8fbe\u5f0f\u7684\u8fd0\u7528<\/p>\n<\/p>\n<p><p>\u6709\u65f6HTML\u6587\u6863\u4e2d\u7684\u6570\u636e\u5206\u6563\u4e14\u683c\u5f0f\u6df7\u4e71\uff0c\u8fd9\u65f6\u53ef\u4ee5\u4f7f\u7528<strong>\u6b63\u5219\u8868\u8fbe\u5f0f<\/strong>\u6765\u63d0\u53d6\u4fe1\u606f\u3002\u6b63\u5219\u8868\u8fbe\u5f0f\u662f\u5b9a\u4e49\u641c\u7d22\u6a21\u5f0f\u7684\u5b57\u7b26\u4e32\uff0c\u7528\u4e8e\u6587\u672c\u641c\u7d22\u548c\u590d\u6742\u5b57\u7b26\u4e32\u64cd\u4f5c\u3002 <\/p>\n<\/p>\n<p><pre><code class=\"language-csharp\">using System;<\/p>\n<p>using System.Text.RegularExpressions;<\/p>\n<p>public class RegexExtractor<\/p>\n<p>{<\/p>\n<p>    public void ExtractEmails(string text)<\/p>\n<p>    {<\/p>\n<p>        Regex emailRegex = new Regex(@&quot;[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+&quot;, RegexOptions.Compiled);<\/p>\n<p>        var matches = emailRegex.Matches(text);<\/p>\n<p>        foreach (Match match in matches)<\/p>\n<p>        {<\/p>\n<p>            Console.WriteLine($&quot;Found email: {match.Value}&quot;);<\/p>\n<p>            \/\/ \u5904\u7406\u5339\u914d\u5230\u7684\u7535\u5b50\u90ae\u4ef6...<\/p>\n<p>        }<\/p>\n<p>    }<\/p>\n<p>}<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u56db\u3001\u591a\u7ebf\u7a0b\u6280\u672f\u7684\u5e94\u7528<\/p>\n<\/p>\n<p><p>\u4e3a\u4e86\u63d0\u9ad8\u722c\u866b\u7684\u6548\u7387\uff0c\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528<strong>\u591a\u7ebf\u7a0b<\/strong>\u6216<strong>\u5f02\u6b65\u7f16\u7a0b<\/strong>\u6280\u672f\u3002\u8fd9\u5728\u5904\u7406\u5927\u91cf\u8bf7\u6c42\u65f6\u5c24\u4e3a\u91cd\u8981\uff0c\u53ef\u4ee5\u6709\u6548\u5730\u63d0\u5347\u722c\u866b\u7684\u6027\u80fd\u3002<\/p>\n<\/p>\n<p><pre><code class=\"language-csharp\">using System;<\/p>\n<p>using System.Collections.Generic;<\/p>\n<p>using System.Threading.Tasks;<\/p>\n<p>public class MultiThreadedCrawler<\/p>\n<p>{<\/p>\n<p>    private HtmlParser _parser;<\/p>\n<p>    public 
MultiThreadedCrawler(HtmlParser parser)<\/p>\n<p>    {<\/p>\n<p>        _parser = parser;<\/p>\n<p>    }<\/p>\n<p>    public async Task StartCrawling(IEnumerable&lt;string&gt; urls)<\/p>\n<p>    {<\/p>\n<p>        var tasks = new List&lt;Task&gt;();<\/p>\n<p>        foreach (string url in urls)<\/p>\n<p>        {<\/p>\n<p>            tasks.Add(Task.Run(() =&gt; _parser.ParseAsync(url)));<\/p>\n<p>        }<\/p>\n<p>        await Task.WhenAll(tasks);<\/p>\n<p>        Console.WriteLine(&quot;Crawling completed!&quot;);<\/p>\n<p>    }<\/p>\n<p>}<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><p>\u7ed3\u5408\u8fd9\u4e9b\u6280\u672f\u548c\u5b9e\u4f8b\u4ee3\u7801\uff0c\u4f60\u53ef\u4ee5\u6784\u5efa\u4e00\u4e2a\u7b80\u5355\u4f46\u529f\u80fd\u5f3a\u5927\u7684\u7f51\u7edc\u722c\u866b\u6765\u6293\u53d6\u548c\u5206\u6790\u7f51\u7edc\u6570\u636e\u3002 \u8bb0\u5f97\u5728\u4f7f\u7528\u7f51\u7edc\u722c\u866b\u65f6\u9075\u5b88\u7f51\u7ad9\u7684robots.txt\u89c4\u5219\uff0c\u4e0d\u8981\u5bf9\u670d\u52a1\u5668\u9020\u6210\u8fc7\u5927\u538b\u529b\uff0c\u5408\u7406\u5b89\u6392\u722c\u53d6\u9891\u7387\u548c\u65f6\u95f4\u3002<\/p>\n<\/p>\n<h2><strong>\u76f8\u5173\u95ee\u7b54FAQs\uff1a<\/strong><\/h2>\n<p><strong>Q: C#\u7f16\u5199\u7684\u7f51\u7edc\u722c\u866b\u793a\u4f8b\u4ee3\u7801\u5728\u54ea\u91cc\u53ef\u4ee5\u627e\u5230\uff1f<\/strong><br \/>\nA: \u5728GitHub\u4e0a\u53ef\u4ee5\u627e\u5230\u5f88\u591aC#\u7f16\u5199\u7684\u7f51\u7edc\u722c\u866b\u793a\u4f8b\u4ee3\u7801\u3002\u4f60\u53ef\u4ee5\u5728GitHub\u7684\u4ee3\u7801\u5e93\u641c\u7d22&quot;web crawler&quot;\u6216\u8005 &quot;C# web crawler&quot;\u6765\u627e\u5230\u8bb8\u591a\u6709\u7528\u7684\u793a\u4f8b\u4ee3\u7801\u3002<\/p>\n<p><strong>Q: \u5982\u4f55\u4f7f\u7528C#\u7f16\u5199\u4e00\u4e2a\u7b80\u5355\u7684\u7f51\u7edc\u722c\u866b\uff1f<\/strong><br \/>\nA: 
\u7f16\u5199\u4e00\u4e2a\u7b80\u5355\u7684C#\u7f51\u7edc\u722c\u866b\u53ef\u4ee5\u901a\u8fc7\u4f7f\u7528HttpClient\u7c7b\u6765\u5b9e\u73b0\u3002\u9996\u5148\uff0c\u4f60\u9700\u8981\u786e\u5b9a\u4f60\u8981\u722c\u53d6\u7684\u7f51\u7ad9\uff0c\u5e76\u4f7f\u7528HttpClient\u53d1\u9001HTTP\u8bf7\u6c42\u6765\u83b7\u53d6\u7f51\u9875\u7684\u5185\u5bb9\u3002\u7136\u540e\uff0c\u4f60\u53ef\u4ee5\u4f7f\u7528\u6b63\u5219\u8868\u8fbe\u5f0f\u6216\u8005HTML\u89e3\u6790\u5668\u6765\u89e3\u6790\u7f51\u9875\u5185\u5bb9\uff0c\u63d0\u53d6\u4f60\u9700\u8981\u7684\u6570\u636e\uff0c\u5e76\u8fdb\u4e00\u6b65\u5904\u7406\u8fd9\u4e9b\u6570\u636e\u3002<\/p>\n<p><strong>Q: \u6709\u6ca1\u6709\u9002\u7528\u4e8e\u521d\u5b66\u8005\u7684C#\u7f51\u7edc\u722c\u866b\u6559\u7a0b\uff1f<\/strong><br \/>\nA: \u662f\u7684\uff0c\u6709\u5f88\u591a\u9002\u5408\u521d\u5b66\u8005\u7684C#\u7f51\u7edc\u722c\u866b\u6559\u7a0b\u53ef\u4f9b\u53c2\u8003\u3002\u4f60\u53ef\u4ee5\u5728\u7f51\u4e0a\u641c\u7d22&quot;C#\u7f51\u7edc\u722c\u866b\u6559\u7a0b&quot;\u6765\u627e\u5230\u4e00\u4e9b\u89c6\u9891\u6559\u7a0b\u548c\u535a\u5ba2\u6587\u7ae0\uff0c\u8fd9\u4e9b\u6559\u7a0b\u4f1a\u9010\u6b65\u5f15\u5bfc\u4f60\u4ece\u57fa\u7840\u5f00\u59cb\u5b66\u4e60\u5982\u4f55\u7f16\u5199\u4e00\u4e2a\u7b80\u5355\u7684C#\u7f51\u7edc\u722c\u866b\u3002\u4e00\u4e9b\u6559\u7a0b\u8fd8\u4f1a\u6db5\u76d6\u66f4\u9ad8\u7ea7\u7684\u4e3b\u9898\uff0c\u5982\u5982\u4f55\u5904\u7406JavaScript\u6e32\u67d3\u7684\u7f51\u9875\u548c\u5982\u4f55\u5e94\u5bf9\u53cd\u722c\u866b\u673a\u5236\u7b49\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"C#\u7f16\u5199\u7684\u7f51\u7edc\u722c\u866b\u793a\u4f8b\u4ee3\u7801\u4e3b\u8981\u4f9d\u8d56\u4e8eHttpClient\u3001HtmlAgilityPack\u3001\u6b63\u5219\u8868\u8fbe\u5f0f\u548c\u591a\u7ebf\u7a0b\u6280 
[&hellip;]","protected":false},"author":3,"featured_media":267923,"comment_status":"closed","ping_status":"","sticky":false,"template":"","format":"standard","meta":{"_acf_changed":false,"footnotes":""},"categories":[37],"tags":[],"acf":[],"_links":{"self":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/267920"}],"collection":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/users\/3"}],"replies":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/comments?post=267920"}],"version-history":[{"count":0,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/267920\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/media\/267923"}],"wp:attachment":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/media?parent=267920"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/categories?post=267920"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/tags?post=267920"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}