Regular Expressions (regex, RE)
A way to express strings more effectively:
- a general-purpose framework for describing strings
- a concise expression that denotes a whole set of strings
- a way to test whether a string has certain characteristic properties
Mainly used for string matching.

RE syntax
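A quick reference for the most common operators (standard regex syntax, not specific to any library):

- `.` matches any single character
- `[ ]` a character set, e.g. `[abc]` or `[a-z]`; `[^abc]` negates the set
- `*` zero or more repetitions of the previous character; `+` one or more; `?` zero or one
- `{m}` exactly m repetitions; `{m,n}` between m and n repetitions
- `|` either the left or the right expression, e.g. `abc|def`
- `^` anchors at the start of the string; `$` anchors at the end
- `( )` grouping; `|` may be used inside a group
- `\d` a digit, equivalent to `[0-9]`; `\w` a word character, equivalent to `[A-Za-z0-9_]`

For example, `P(Y|YT|YTH|YTHO)?N` matches 'PN', 'PYN', 'PYTN', 'PYTHN', and 'PYTHON'.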




The re library

OK, after using Python for this long, I finally understand what the `r` prefix does: it marks a raw string, in which backslashes are kept literally instead of being interpreted as escape sequences, which is exactly what regex patterns need.
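A minimal sketch of the difference:

```python
import re

# Without the r prefix, Python's string parser consumes backslashes first,
# so every backslash meant for the regex engine must be doubled.
plain = '\\d+'   # the regex engine sees \d+
raw = r'\d+'     # identical pattern, written as a raw string

print(re.findall(plain, 'abc 123 def 45'))  # ['123', '45']
print(re.findall(raw, 'abc 123 def 45'))    # ['123', '45']
```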


The difference between match() and search() (demonstrated in the sketch below):
- match() only checks whether the RE matches at the beginning of the string
- search() scans the entire string looking for a match
- match() returns a match object only when the match succeeds at position 0; if the pattern does not match at the start of the string, match() returns None
By default, re.split() returns the pieces of the string that remain after all matched content is removed.
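A minimal sketch of these behaviors:

```python
import re

text = 'id: 12345, zip: 100081'

# search() scans the whole string and finds the first match anywhere.
print(re.search(r'\d+', text).group(0))    # '12345'

# match() anchors at position 0; the string starts with 'id', not a digit,
# so no match object is returned.
print(re.match(r'\d+', text))              # None

# split() removes every match and returns the remaining pieces.
print(re.split(r'\d+', text))              # ['id: ', ', zip: ', '']

# maxsplit limits how many times the string is split.
print(re.split(r'\d+', text, maxsplit=1))  # ['id: ', ', zip: 100081']
```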


Compiling a pattern with re.compile() makes repeated calls more convenient, since the pattern is parsed only once.
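A minimal sketch:

```python
import re

# Compile once, reuse many times; the compiled object exposes the same
# methods (match, search, findall, split, sub, ...) as the module-level API.
pat = re.compile(r'\d{6}')

for href in ['stock/600519.html', 'about.html', 'stock/000001.html']:
    m = pat.search(href)
    if m:
        print(m.group(0))  # 600519, then 000001
```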
The match object of the re library
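match() and search() return a match object on success; its main attributes and methods (standard-library behavior) in a short sketch:

```python
import re

m = re.search(r'\d{6}', 'zip code: 100081, area: Haidian')

print(m.string)    # the text that was searched
print(m.re)        # the compiled pattern object used for the match
print(m.group(0))  # the matched substring: '100081'
print(m.start())   # start index of the match: 10
print(m.end())     # index one past the end of the match: 16
print(m.span())    # (start, end) as a tuple: (10, 16)
```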

Greedy matching vs. minimal matching
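By default the re quantifiers are greedy: they match the longest possible substring. Appending `?` to a quantifier (`*?`, `+?`, `??`, `{m,n}?`) switches to minimal matching. A minimal sketch:

```python
import re

text = 'PYANBNCNDN'

# Greedy: .* grabs as much as possible before the final N.
print(re.search(r'PY.*N', text).group(0))   # 'PYANBNCNDN'

# Minimal: .*? stops at the first N it can reach.
print(re.search(r'PY.*?N', text).group(0))  # 'PYAN'
```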


Example: scraping Taobao product listings

Taobao has introduced an anti-scraping mechanism, so you need to attach your own cookie and headers, which can be copied out of the browser's developer tools. See the second reference for details.
```python
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 30 22:19:44 2020
@author: 99488
"""
import requests
import re


def getHTMLText(url):
    # Fetch the page with a browser-like User-Agent and a logged-in cookie
    # (both copied from the browser developer tools) to get past Taobao's
    # anti-scraping checks.
    try:
        kv = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'}
        cook_dict = {'cookie': 'cna=OHYzFlxpCgQCAUe6kNSsqoje; thw=ca; hng=US%7Czh-CN%7CUSD%7C840; miid=1759477185591050612; t=86ef6455e34fa74ef8073b0294ba59c4; _fbp=fb.1.1597175403806.1228562458; sgcookie=EwLwA56IkyVfwkAYsuIji; uc3=vt3=F8dBxG2nhNGf6ANmj2E%3D&nk2=saDQmvYSSNhcwz6%2Ftu1PMg%3D%3D&id2=UUtIF0HlMdX9gg%3D%3D&lg2=VFC%2FuZ9ayeYq2g%3D%3D; lgc=%5Cu4E00%5Cu751F%5Cu4E00%5Cu4E16%5Cu77E2%5Cu5FD7%5Cu4E0D%5Cu6E1D; uc4=id4=0%40U2lyjfkOdcEEGNaGFcBY53X4unzv&nk4=0%40s8WQPlk07CQNbiZZSRanHDOHZLmOlqprLVZG; tracknick=%5Cu4E00%5Cu751F%5Cu4E00%5Cu4E16%5Cu77E2%5Cu5FD7%5Cu4E0D%5Cu6E1D; _cc_=VT5L2FSpdA%3D%3D; enc=wQ03VR5YiiTpokMPWtaWQTijCIuU411CeGWlHZG9Fm3J1QTjQijDJmLiXPjyZ2oZ%2FsGI3oxoV8FEOKaBEmWPiA%3D%3D; mt=ci=-1_0; xlly_s=1; cookie2=13a2bbd859744d81891b5cd6ea56f77b; _tb_token_=b30e880d5ee8; v=0; _m_h5_tk=39bb97b42dc8e95ae09a91c97c52dcca_1598914400674; _m_h5_tk_enc=4af73e4605197a7a0d9bf93ad109ad2e; uc1=cookie14=UoTV5OMV2j5nnw%3D%3D; isg=BCkpBsqYUhWVx2-0KyrRS3jjONWD9h0oH_TJWMse_JBPkkikE0bK-CFDUC6kCrVg; l=eBNBEpSrQhx-NqOdBOfZnurza77TTIRfguPzaNbMiOCPO0CH5jdcWZPIxKLMCnMNH6okR3-WSGYkBALpHyhSnxv9-3k_J_4Z3dC..; tfstk=cjI1Bb48kfc1J6aqbOweg-Up3UtVa3lB3P9ACNZzpVO3oxSe9sAo4QwP-ZcovUpC'}
        cookies = requests.utils.cookiejar_from_dict(cook_dict, cookiejar=None, overwrite=True)
        rs = requests.Session()
        rs.cookies = cookies
        r = rs.get(url, timeout=30, headers=kv)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ''


def parsePage(ilt, html):
    # The search results embed product data as JSON inside the page script,
    # so regular expressions pull the key/value pairs out directly.
    try:
        plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)
        tlt = re.findall(r'\"raw_title\"\:\".*?\"', html)
        sale = re.findall(r'\"view_sales\"\:\"[\d+].*?\"', html)
        # plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"\,\"pic_url\"\:\".*?\"', html)
        for i in range(len(plt)):
            # eval() strips the surrounding quotes from the captured value.
            price = eval(plt[i].split(':')[1])
            title = eval(tlt[i].split(':')[1])
            saleNum = eval(sale[i].split(':')[1])  # parsed but not printed below
            # item = eval(re.split(':|,', plt[i]))
            ilt.append([price, title])
    except:
        print('EOF')


def printGoodsList(ilt):
    tplt = '{:4}\t{:8}\t{:16}'
    print(tplt.format('序号', '价格', '商品名称'))
    count = 0
    for g in ilt:
        count = count + 1
        print(tplt.format(count, g[0], g[1]))


def main():
    goods = '戒指'
    depth = 2  # number of result pages to crawl
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            # Each result page shows 44 items; the s parameter is the offset.
            url = start_url + '&s=' + str(44 * i)
            html = getHTMLText(url)
            parsePage(infoList, html)
        except:
            continue
    printGoodsList(infoList)


main()
```

Targeted stock-information scraping

The pages used in the course video are no longer available, so I substituted other pages that meet the same requirements.

```python
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 31 19:49:50 2020
@author: 99488
"""
import requests
from bs4 import BeautifulSoup
import traceback
import re


def getHTMLText(url):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        # Return an empty string so callers can test `if html == ""`.
        # (The original returned "~", which that test never caught.)
        return ""


def getStockList(lst, stockURL):
    # Stock codes are 6-digit numbers embedded in the href of each <a> tag.
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']
            match = re.findall(r"\d{6}", href)
            if len(match) != 0:
                lst.append(match)
        except:
            continue


def getStockInfo(lst, stockURL, fpath):
    for stock in lst:
        url = stockURL + stock[0]
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            # Key/value pairs sit in <dt>/<dd> tags inside the detail-data div.
            stockInfo = soup.find('div', attrs={'class': 'detail-data'})
            name = soup.find_all(attrs={'class': 'name'})[0]
            infoDict.update({'股票名称': name.text.split()[0]})
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key = keyList[i].text
                val = valueList[i].text
                infoDict[key] = val
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        except:
            traceback.print_exc()
            continue


def main():
    stock_list_url = 'https://www.banban.cn/gupiao/list_sh.html'
    stock_info_url = 'https://www.laohu8.com/stock/'
    output_file = 'D:\\master\\learning\\cs\\python_crawler\\stockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)
    return ""


main()
```

For a small crawler, you can save time by hard-coding the encoding (when you already know it) instead of detecting it dynamically with r.encoding = r.apparent_encoding; apparent_encoding has to analyze the response body to guess the charset, which is relatively slow.
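A minimal sketch, assuming the target site (placeholder URL below) is known to serve UTF-8:

```python
import requests

r = requests.get('https://www.example.com')  # hypothetical target site
# Skip the content-based charset guess and set the known encoding directly.
r.encoding = 'utf-8'
print(r.text[:200])
```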
References:
1. https://blog.csdn.net/weixin_38819889/article/details/93846579
2. https://blog.csdn.net/Aelous_dp/article/details/107445147