Python 爬虫爬取 chrome webstore
插件本地存放目录:
~/Library/Application Support/Google/Chrome/Default/Extensions
插件地址 = 'https://chrome.google.com/webstore/detail/' + 插件存放目录里的文件夹名
# --- Dependencies and SOCKS5 proxy configuration -------------------------
import socket
import time

import requests
import socks
from bs4 import BeautifulSoup

# Local SOCKS5 endpoint (e.g. a Shadowsocks client) used to reach the
# Chrome Web Store.
SOCKS5_PROXY_HOST = '127.0.0.1'
SOCKS5_PROXY_PORT = 1086

# Keep a reference to the unpatched socket class so it could be restored
# later, then route ALL sockets through the SOCKS5 proxy by monkey-patching
# socket.socket with PySocks' drop-in replacement.
default_socket = socket.socket
socks.set_default_proxy(socks.SOCKS5, SOCKS5_PROXY_HOST, SOCKS5_PROXY_PORT)
socket.socket = socks.socksocket
# A Web Store detail page URL is baseUrl + <extension id>; the id is the
# folder name found under the local Chrome Extensions directory
# (~/Library/Application Support/Google/Chrome/Default/Extensions).
baseUrl = 'https://chrome.google.com/webstore/detail/'

# Extension ids to crawl, one per installed extension folder.
extensions = [
    'aabcgdmkeabbnleenpncegpcngjpnjkc',
    'aajodjghehmlpahhboidcpfjcncmcklf',
    'aalnjolghjkkogicompabhhbbkljnlka',
]
# For each extension id: fetch its Web Store detail page, print a Markdown
# heading linking to the page (using its <title>) followed by the text of
# the <pre class="C-b-p-j-Oa"> element (presumably the extension's
# description — verify against the current webstore markup).
#
# Hoisted out of the loop: the headers dict is loop-invariant.  A desktop
# User-Agent is sent instead of the default python-requests one.
_HEADERS = {
    "User-Agent": "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"}

for extension in extensions:
    url = baseUrl + extension
    # FIX: added a timeout so one stalled connection cannot hang the whole
    # crawl; a network failure is reported the same way as an empty page.
    try:
        html_source = requests.get(url, headers=_HEADERS, timeout=30).text
    except requests.RequestException:
        html_source = ''
    if html_source:
        try:
            soup = BeautifulSoup(html_source, 'html.parser')
            title = soup.html.find_all("title")[0].text
            print('### [' + title + ']' + '(' + url + " '0.0'" + ')')
            print(soup.find_all("pre", class_="C-b-p-j-Oa")[0].text)
        except (AttributeError, IndexError):
            # FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.  Only the lookup failures that
            # occur when the expected tags are missing are caught here.
            print('fail for: ' + '[' + url + ']')
        else:
            print('ok')
    else:
        print('fail for: ' + '[' + url + ']')
    # Throttle requests to avoid being rate-limited by the webstore.
    time.sleep(4)
|
参考: