{"id":44,"date":"2018-11-18T18:48:20","date_gmt":"2018-11-18T10:48:20","guid":{"rendered":"http:\/\/47.107.162.66\/?p=44"},"modified":"2020-01-06T20:51:26","modified_gmt":"2020-01-06T12:51:26","slug":"lib-of-requests-bs4","status":"publish","type":"post","link":"https:\/\/harson.co\/index.php\/2018\/11\/18\/lib-of-requests-bs4\/","title":{"rendered":"Lib of Requests &#038; bs4"},"content":{"rendered":"<h1>Requests\u5e93<\/h1>\n<h3>\u00a0 \u00a0 \u00a0\u521b\u5efa\u4e00\u4e2arequests\u7c7b\u7684\u5b9e\u4f8b\u5316\u5bf9\u8c61r, \u5e76\u4f7f\u7528get\u65b9\u6cd5\u8bbf\u95eeurl(&#8220;https:\/\/pintia.cn\/&#8221;)<\/h3>\n<pre class=\"pure-highlightjs\"><code class=\"\">r = requests.get(\"https:\/\/pintia.cn\/\")<\/code><\/pre>\n<h3>\u00a0 \u00a0 \u4ee5\u7279\u5b9a\u65b9\u5f0f\u8bbf\u95ee\u683c\u5f0f:<\/h3>\n<pre class=\"pure-highlightjs\"><code class=\"\">requests.request('method',url,**kwargs)\r\n\u4f7f\u7528\u65b9\u6cd5\uff1a\r\n     r = requests.request('GET',url,**kwargs)\r\n     **kwargs:(\u661f\u661f\u5f00\u5934\u4e3a\u53ef\u9009\u53c2\u6570\uff0c\u5e38\u7528\u53c2\u6570\u5982\u4e0b\uff1a)  sthdic == something dictionary\u9700\u8981\u5b57\u5178\u4f5c\u4e3a\u53c2\u6570\r\n\r\n     params = sthdic = {key:value}-----\u5c06\u5b57\u5178\u4e2d\u5b57\u7b26\u4e32\u4f5c\u4e3a\u53c2\u6570\u586b\u5230url\u4e2d *get\u65b9\u6cd5\r\n     data = sthdic ------\u5b57\u5178,\u5b57\u7b26\u4e32\u4f5c\u4e3a\u53c2\u6570\u4f5c\u4e3aRequest\u7684\u5185\u5bb9\r\n     headers = sthdic -----\u5b9a\u5236\u5934\r\n     cookies\r\n     files<\/code><\/pre>\n<h3>\u8fd4\u56deHTTP\u72b6\u6001\u7801<\/h3>\n<pre class=\"pure-highlightjs\"><code class=\"\">r.status_code<\/code><\/pre>\n<h3>\u7f16\u7801\u65b9\u5f0f<\/h3>\n<pre class=\"pure-highlightjs\"><code class=\"\">r.encoding #\u6839\u636e\u670d\u52a1\u5668\u53d1\u9001\u7684\u4fe1\u606f\u5224\u65ad\u7f16\u7801\uff0c\u9ed8\u8ba4\u4e3aISO-8859-1\r\nr.apparent_encoding 
#\u6839\u636e\u7f51\u9875\u5185\u5bb9\u5224\u65ad\u7f16\u7801\r\n#\u4e00\u52b3\u6c38\u9038\uff0c\u6052\u6839\u636e\u7f51\u9875\u5185\u5bb9\u5224\u65ad\u7f16\u7801\r\nr.encoding = r.apparent_encoding<\/code><\/pre>\n<h3>\u5178\u578b\u722c\u53d6\u6848\u4f8b<\/h3>\n<pre class=\"pure-highlightjs\"><code class=\"\">def getHTMLText(url):\r\n\ttry:\r\n\t\tr = requests.get(url, timeout=30)\r\n\t\tr.raise_for_status() \r\n\t\t#\u5982\u679cHTTP\u72b6\u6001\u4e0d\u662f200 \u5219\u5f15\u53d1HTTPError\u5f02\u5e38\r\n\t\tr.encoding = r.apparent_encoding\r\n\t\tr.encoding = 'UTF-8'\r\n\t\treturn r.text\r\n\texcept:\r\n\t\treturn \"error!\"\r\n\r\n#\u7f51\u7edc\u8fde\u63a5\u4e0d\u4e00\u5b9a\u7a33\u5b9a\uff0c\u9700\u8981\u6355\u83b7\u5f02\u5e38<\/code><\/pre>\n<h3>\u5b9a\u5236\u5934<\/h3>\n<div>\n<pre class=\"pure-highlightjs\"><code class=\"\">headers = { 'User-Agent':'Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/52.0.2743.116 Safari\/537.36 Edge\/15.15063'}\r\n#\u4ee5\u5b57\u5178\u5f62\u5f0f\uff0c\u4f7f\u7528\r\nrequests.get(\"https:\/\/pintia.cn\/\", headers = headers)<\/code><\/pre>\n<\/div>\n<div>\n<h1>bs4\u5e93<\/h1>\n<h3><\/h3>\n<h3>\u00a0 \u00a0 Requests \u5e93\u7528\u6765\u83b7\u53d6\u7f51\u9875\u4fe1\u606f<\/h3>\n<h3>\u00a0 \u00a0 \u800c BeautifulSoup4\u5e93\u662f\u7528\u6765\u89e3\u6790HTML\u6587\u4ef6\u7684\u5e93\uff0c\u7b80\u79f0bs4\u3002<\/h3>\n<h4>\u00a0 \u00a0 \u00a0\u9700\u8981\u5982\u6b64\u5bfc\u5165(\u4ecebs4\u5e93\u4e2d\u5bfc\u5165BeautifulSoup\u7c7b)<\/h4>\n<\/div>\n<div>\n<pre class=\"pure-highlightjs\"><code class=\"\">from bs4 import BeautifulSoup<\/code><\/pre>\n<\/div>\n<p>BeautifulSoup\u7c7b\u53ea\u80fd\u89e3\u6790\u6587\u672c\uff0c\u56e0\u6b64\u9700\u8981requests\u5e93\u83b7\u53d6\u7f51\u9875\u6587\u672c\uff0c\u4e0b\u6587\u5047\u8bbe test_html\u53d8\u91cf \u5df2\u5305\u542b\u4e00\u4e2a\u7f51\u9875\u5185\u5bb9<\/p>\n<h3>\u00a0 
\u00a0\u89e3\u6790\u8be5\u9875\u9762\u5185\u5bb9<\/h3>\n<div>\n<pre class=\"pure-highlightjs\"><code class=\"\">soup = BeautifulSoup (test_html, 'html.parser')    # html.parser\u4e3abs\u7684\u89e3\u6790\u5668<\/code><\/pre>\n<\/div>\n<h3>\u00a0 \u00a0\u83b7\u53d6\u6807\u7b7e(tag)\u5185\u5bb9<\/h3>\n<pre class=\"pure-highlightjs\"><code class=\"\">soup.tag #\u53ef\u4ee5\u7528\u6765\u83b7\u53d6HTML\u9875\u9762\u7684\u4efb\u4e00\u4e2a\u6807\u7b7e\r\n#\u5982:\r\nsoup.title #\u53ef\u83b7\u53d6\u5f53\u524d\u7f51\u9875\u7684\u6807\u7b7e\u540d\r\n#\u5982\u679c\u8be5\u6709\u591a\u4e2a\u6807\u7b7e\uff0csoup.tag \u8fd4\u56de\u7b2c\u4e00\u4e2a<\/code><\/pre>\n<h3>\u00a0 \u83b7\u53d6\u6807\u7b7e(tag)\u540d\u5b57<\/h3>\n<pre class=\"pure-highlightjs\"><code class=\"\">soup.a.parent.name  #\u901a\u5e38\u7528\u4e8e\u83b7\u53d6\u7236\u7c7b\u542b\u6709\u4ec0\u4e48\u6807\u7b7e<\/code><\/pre>\n<h3>\u00a0 \u83b7\u53d6tag\u7684\u5c5e\u6027(attrs)<\/h3>\n<pre class=\"pure-highlightjs\"><code class=\"\">soup.a.attrs #\u8fd4\u56de a \u6807\u7b7e\u5c16\u62ec\u53f7\u91cc\u7684\u5185\u5bb9\uff0c\u8fd4\u56de\u7c7b\u578b\u4e3a\u5b57\u5178<\/code><\/pre>\n<h3>\u00a0 \u00a0\u83b7\u53d6tag\u7684\u663e\u793a\u5728\u7f51\u9875\u4e2d\u7684\u6587\u672c\u5185\u5bb9<\/h3>\n<pre class=\"pure-highlightjs\"><code class=\"\">soup.a.string<\/code><\/pre>\n<h2>\u6807\u7b7e\u6811\u7684\u4e0b\u884c\u904d\u5386<\/h2>\n<pre class=\"pure-highlightjs\"><code class=\"\">for child in soup.body.children:\r\n    print(child)    #\u904d\u5386body\u6807\u7b7e\u7684\u5b50\u6807\u7b7e\r\n\r\nfor child in soup.body.descendants:\r\n    print(child)   #\u904d\u5386body\u6807\u7b7e\u6240\u6709\u7684\u5b50\u5b59\u6807\u7b7e\uff08\u8fed\u4ee3\u904d\u5386\u5b50\u6807\u7b7e\uff09<\/code><\/pre>\n<h2>\u6807\u7b7e\u6811\u7684\u4e0a\u884c\u904d\u5386<\/h2>\n<pre class=\"pure-highlightjs\"><code class=\"\">.parent  #\u8be5\u6807\u7b7e\u7684\u4e0a\u4e00\u4e2a\u7236\u6807\u7b7e\r\n.parents 
#\u8be5\u6807\u7b7e\u7684\u6240\u6709\u8fed\u4ee3\u7236\u8f88\u6807\u7b7e\uff0c\u5982\u679c\u5df2\u662f\u6700\u4e0a\u7ea7\u6807\u7b7e\uff0c\u4f7f\u7528\u8be5\u65b9\u6cd5\u65f6\u4f1a\u8fd4\u56de None\r\n\r\n\r\nfor parent in soup.a.parents:\r\n    if parent is None:\r\n        print(parent)\r\n    else:\r\n        print(parent.name)<\/code><\/pre>\n<p>&nbsp;<\/p>\n<h2>\u6807\u7b7e\u6811\u7684\u5e73\u884c\u904d\u5386<\/h2>\n<pre class=\"pure-highlightjs\"><code class=\"\">.next_sibling                  #\u8fd4\u56de\u6309\u7167HTML\u6587\u672c\u987a\u5e8f\u7684\u4e0b\u4e00\u4e2a\u5e73\u884c\u8282\u70b9\u6807\u7b7e\r\n.previous_sibling              #\u8fd4\u56de\u6309\u7167HTML\u6587\u672c\u987a\u5e8f\u7684\u4e0a\u4e00\u4e2a\u5e73\u884c\u8282\u70b9\u6807\u7b7e\r\n.next_siblings                 #\u8fed\u4ee3\u7c7b\u578b\uff0c\u8fd4\u56de\u6309\u7167HTML\u6587\u672c\u987a\u5e8f\u7684\u540e\u7eed\u6240\u6709\u5e73\u884c\u8282\u70b9\u6807\u7b7e\r\n.previous_siblings             #\u8fed\u4ee3\u7c7b\u578b\uff0c\u8fd4\u56de\u6309\u7167HTML\u6587\u672c\u987a\u5e8f\u7684\u524d\u7eed\u6240\u6709\u5e73\u884c\u8282\u70b9\u6807\u7b7e\r\n\r\nfor sibling in soup.a.next_siblings:\r\n    print(sibling)       #\u6240\u6709\u540e\u7eed\u6807\u7b7e\r\nfor sibling in soup.a.previous_siblings:\r\n    print(sibling)       #\u6240\u6709\u524d\u7eed\u6807\u7b7e\r\n\r\n#\u5e73\u884c\u904d\u5386\u6807\u7b7e\u5fc5\u987b\u5728\u540c\u4e00\u7236\u6807\u7b7e\u4e0b<\/code><\/pre>\n<h3>find_all()<\/h3>\n<p>tag(&#8230;) \u7b49\u4ef7\u4e8e tag.find_all(&#8230;)<\/p>\n<p>soup(&#8230;) \u7b49\u4ef7\u4e8e soup.find_all(&#8230;)<\/p>\n<h3>\u68c0\u7d22<\/h3>\n<pre class=\"pure-highlightjs\"><code class=\"\">soup.find_all('a')\r\nsoup.find_all(['a','b'])   #\u68c0\u7d22\u591a\u4e2a\u6807\u7b7e\r\nsoup.find_all(id='link1')  #\u68c0\u7d22id\r\nsoup.find_all(attrs = {\"class\":\"course\"})   #\u68c0\u7d22\u5c5e\u6027\r\nsoup.find_all(string = 'This is a sample')  
#\u68c0\u7d22\u6587\u672c<\/code><\/pre>\n<p>&nbsp;<\/p>\n","protected":false},"excerpt":{"rendered":"<p>Requests\u5e93 \u00a0 \u00a0 \u00a0\u521b\u5efa\u4e00\u4e2arequests\u7c7b\u7684\u5b9e\u4f8b\u5316\u5bf9\u8c61r, \u5e76\u4f7f\u7528get\u65b9\u6cd5\u8bbf\u95eeurl(&#038;#82 [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[9],"tags":[],"class_list":["post-44","post","type-post","status-publish","format-standard","hentry","category-python"],"_links":{"self":[{"href":"https:\/\/harson.co\/index.php\/wp-json\/wp\/v2\/posts\/44","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/harson.co\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/harson.co\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/harson.co\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/harson.co\/index.php\/wp-json\/wp\/v2\/comments?post=44"}],"version-history":[{"count":4,"href":"https:\/\/harson.co\/index.php\/wp-json\/wp\/v2\/posts\/44\/revisions"}],"predecessor-version":[{"id":436,"href":"https:\/\/harson.co\/index.php\/wp-json\/wp\/v2\/posts\/44\/revisions\/436"}],"wp:attachment":[{"href":"https:\/\/harson.co\/index.php\/wp-json\/wp\/v2\/media?parent=44"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/harson.co\/index.php\/wp-json\/wp\/v2\/categories?post=44"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/harson.co\/index.php\/wp-json\/wp\/v2\/tags?post=44"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}