
Web Crawling in Practice

Regular expressions (regular expression, regex; RE)

A way to express strings more effectively:

  1. A universal framework for expressing strings
  2. A concise way to express a whole group of strings
  3. A way to test whether a string has certain features

Mainly used for string matching.

RE syntax
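A minimal sketch of the most common operators (the P(Y|YT|YTH|YTHO)?N example is my own illustration, not from the course):

import re

# One pattern can describe a whole family of strings:
# P(Y|YT|YTH|YTHO)?N matches 'PN', 'PYN', 'PYTN', 'PYTHN' and 'PYTHON'.
pattern = r'P(Y|YT|YTH|YTHO)?N'
for s in ['PN', 'PYN', 'PYTHON', 'PYTHO']:
    print(s, bool(re.match(pattern, s)))    # 'PYTHO' fails: no trailing N

# Other frequently used operators:
#   .       any single character
#   [abc]   character set: one of 'a', 'b', 'c'
#   *       previous char 0 or more times
#   +       previous char 1 or more times
#   ?       previous char 0 or 1 time
#   {m,n}   previous char m to n times
#   ^ / $   start / end of the string
#   \d \w   a digit / a word character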

The re library

OK, after using Python for this long, I finally get it here... what the r prefix (raw string) actually does.
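A quick sketch: the r prefix only changes how Python itself parses the literal, so the backslashes reach the regex engine intact.

import re

# Without r, Python's own escape handling eats the backslash first,
# so you would have to write '\\d' to hand '\' + 'd' over to re.
print('\\d' == r'\d')                      # True: the same two characters

print(re.findall(r'\d+', 'a1b22c333'))     # ['1', '22', '333']
print(re.findall('\\d+', 'a1b22c333'))     # same result, uglier to type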

The difference between match() and search() (a short demo follows the list):

  • match() only tests whether the RE matches at the beginning of the string,
  • search() scans the entire string looking for a match,
  • match() returns a result only when the match succeeds at position 0; if the match does not start at the beginning, match() returns None.
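A minimal demo of the difference (my own example):

import re

s = 'python 123'

print(re.match(r'\d+', s))     # None: no digits at position 0
print(re.search(r'\d+', s))    # a match object: scanning finds '123'

m = re.search(r'\d+', s)
if m:                          # guard first: both calls can return None
    print(m.group(0))          # '123'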

By default, split() returns whatever is left of the string after all matched content is removed ~
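A minimal demo (my own example): the matched substrings are dropped, and the optional maxsplit argument limits how many splits are made.

import re

print(re.split(r'\d+', 'one1two22three'))              # ['one', 'two', 'three']
print(re.split(r'\d+', 'one1two22three', maxsplit=1))  # ['one', 'two22three']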

After compile(), the pattern is more convenient to call repeatedly.
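A sketch of the workflow: compile once into a pattern object, then reuse it so the pattern is not re-parsed on every call.

import re

pat = re.compile(r'\d{6}')          # compile once...

for text in ['sh600519', 'no digits here', 'sz000001']:
    m = pat.search(text)            # ...call many times
    print(m.group(0) if m else 'no match')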

The re library's match object
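A minimal sketch of the match object's common attributes and methods (my own example):

import re

m = re.search(r'PY', 'python: PYTHON')

# Attributes: the searched text, the pattern, and the search range.
print(m.string)                        # 'python: PYTHON'
print(m.re.pattern)                    # 'PY'
print(m.pos, m.endpos)                 # 0 14: the whole string was searched

# Methods: what matched and where it sits.
print(m.group(0))                      # 'PY'
print(m.start(), m.end(), m.span())    # 8 10 (8, 10)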

Greedy matching and minimal matching
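A sketch of the difference: quantifiers are greedy by default (longest match); appending ? makes them minimal (shortest match).

import re

s = 'PYANBNCNDN'

print(re.search(r'PY.*N', s).group(0))     # 'PYANBNCNDN': greedy, runs to the last N
print(re.search(r'PY.*?N', s).group(0))    # 'PYAN': minimal, stops at the first N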

Example: crawling Taobao product listings

Taobao has introduced anti-crawling measures, so you need to supply your own cookie and header, grabbed with the browser's developer tools. See the second reference for details ~

# -*- coding: utf-8 -*-
"""
Created on Sun Aug 30 22:19:44 2020

@author: 99488
"""

import requests
import re

def getHTMLText(url):
    try:
        kv={'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'}
        # Raw cookie string copied from the browser's developer tools
        cookie_str='cna=OHYzFlxpCgQCAUe6kNSsqoje; thw=ca; hng=US%7Czh-CN%7CUSD%7C840; miid=1759477185591050612; t=86ef6455e34fa74ef8073b0294ba59c4; _fbp=fb.1.1597175403806.1228562458; sgcookie=EwLwA56IkyVfwkAYsuIji; uc3=vt3=F8dBxG2nhNGf6ANmj2E%3D&nk2=saDQmvYSSNhcwz6%2Ftu1PMg%3D%3D&id2=UUtIF0HlMdX9gg%3D%3D&lg2=VFC%2FuZ9ayeYq2g%3D%3D; lgc=%5Cu4E00%5Cu751F%5Cu4E00%5Cu4E16%5Cu77E2%5Cu5FD7%5Cu4E0D%5Cu6E1D; uc4=id4=0%40U2lyjfkOdcEEGNaGFcBY53X4unzv&nk4=0%40s8WQPlk07CQNbiZZSRanHDOHZLmOlqprLVZG; tracknick=%5Cu4E00%5Cu751F%5Cu4E00%5Cu4E16%5Cu77E2%5Cu5FD7%5Cu4E0D%5Cu6E1D; _cc_=VT5L2FSpdA%3D%3D; enc=wQ03VR5YiiTpokMPWtaWQTijCIuU411CeGWlHZG9Fm3J1QTjQijDJmLiXPjyZ2oZ%2FsGI3oxoV8FEOKaBEmWPiA%3D%3D; mt=ci=-1_0; xlly_s=1; cookie2=13a2bbd859744d81891b5cd6ea56f77b; _tb_token_=b30e880d5ee8; v=0; _m_h5_tk=39bb97b42dc8e95ae09a91c97c52dcca_1598914400674; _m_h5_tk_enc=4af73e4605197a7a0d9bf93ad109ad2e; uc1=cookie14=UoTV5OMV2j5nnw%3D%3D; isg=BCkpBsqYUhWVx2-0KyrRS3jjONWD9h0oH_TJWMse_JBPkkikE0bK-CFDUC6kCrVg; l=eBNBEpSrQhx-NqOdBOfZnurza77TTIRfguPzaNbMiOCPO0CH5jdcWZPIxKLMCnMNH6okR3-WSGYkBALpHyhSnxv9-3k_J_4Z3dC..; tfstk=cjI1Bb48kfc1J6aqbOweg-Up3UtVa3lB3P9ACNZzpVO3oxSe9sAo4QwP-ZcovUpC'
        # Parse 'name=value; name=value; ...' into a dict: passing
        # {'cookie': whole_string} to cookiejar_from_dict would send one
        # bogus cookie literally named 'cookie' instead of the real ones
        cook_dict=dict(item.split('=',1) for item in cookie_str.split('; '))
        rs=requests.Session()
        rs.cookies=requests.utils.cookiejar_from_dict(cook_dict)
        r=rs.get(url,timeout=30,headers=kv)
        r.raise_for_status()
        r.encoding=r.apparent_encoding
        return r.text
    except:
        return ''
    
def parsePage(ilt,html):
    try:
        # The result page embeds a JS object; extract fields with REs
        plt=re.findall(r'"view_price":"[\d.]*"',html)
        tlt=re.findall(r'"raw_title":".*?"',html)
        sale=re.findall(r'"view_sales":"[\d+].*?"',html)
        for i in range(len(plt)):
            # eval() strips the surrounding quotes from each value;
            # split(':',1) keeps titles that themselves contain a colon intact
            price=eval(plt[i].split(':',1)[1])
            title=eval(tlt[i].split(':',1)[1])
            saleNum=eval(sale[i].split(':',1)[1])
            ilt.append([price,title,saleNum])
    except:
        print('parse error')
    

def printGoodsList(ilt):
    tplt='{:4}\t{:8}\t{:16}\t{:8}'
    print(tplt.format('序号','价格','商品名称','销量'))
    count=0
    for g in ilt:
        count=count+1
        print(tplt.format(count,g[0],g[1],g[2]))

def main():
    goods='戒指'                    # search keyword
    depth=2                         # number of result pages to crawl
    start_url='https://s.taobao.com/search?q='+goods
    infoList=[]
    for i in range(depth):
        try:
            url=start_url+'&s='+str(44*i)   # Taobao lists 44 items per page
            html=getHTMLText(url)
            parsePage(infoList,html)
        except:
            continue
    printGoodsList(infoList)

main()

Targeted stock crawling

The pages used in the teaching video are no longer available... so I swapped in other pages that fit the requirements ~

# -*- coding: utf-8 -*-
"""
Created on Mon Aug 31 19:49:50 2020

@author: 99488
"""

import requests
from bs4 import BeautifulSoup
import traceback
import re

def getHTMLText(url):
    try:
        r=requests.get(url)
        r.raise_for_status()
        r.encoding=r.apparent_encoding
        return r.text
    except:
        return ''    # empty string matches the skip check in getStockInfo
    
def getStockList(lst,stockURL):
    html=getHTMLText(stockURL)
    soup=BeautifulSoup(html,'html.parser')
    a=soup.find_all('a')
    for i in a:
        try:
            href=i.attrs['href']
            # stock codes are 6-digit numbers inside the link target
            match=re.findall(r'\d{6}',href)
            if len(match)!=0:
                lst.append(match[0])
        except:
            continue
    
def getStockInfo(lst,stockURL,fpath):
    for stock in lst:
        url=stockURL+stock
        html=getHTMLText(url)
        try:
            if html=='':
                continue
            infoDict={}
            soup=BeautifulSoup(html,'html.parser')
            stockInfo=soup.find('div',attrs={'class':'detail-data'})
            name=soup.find_all(attrs={'class':'name'})[0]
            infoDict.update({'股票名称':name.text.split()[0]})

            # the detail block is a <dl>: <dt> holds field names, <dd> values
            keyList=stockInfo.find_all('dt')
            valueList=stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key=keyList[i].text
                val=valueList[i].text
                infoDict[key]=val

            with open(fpath,'a',encoding='utf-8') as f:
                f.write(str(infoDict)+'\n')
        except:
            traceback.print_exc()
            continue
            
    
    
def main():
    stock_list_url='https://www.banban.cn/gupiao/list_sh.html'
    stock_info_url='https://www.laohu8.com/stock/'
    output_file='D:\\master\\learning\\cs\\python_crawler\\stockInfo.txt'
    slist=[]
    getStockList(slist,stock_list_url)
    getStockInfo(slist,stock_info_url,output_file)

main()

A small crawler can save time by hard-coding the encoding instead of detecting it dynamically with r.encoding=r.apparent_encoding ~
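For example, assuming you already know the target page is served as UTF-8:

import requests

r = requests.get('https://example.com', timeout=30)
r.raise_for_status()
r.encoding = 'utf-8'      # skip apparent_encoding's costly content sniffing
text = r.text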

reference:
1. https://blog.csdn.net/weixin_38819889/article/details/93846579
2. https://blog.csdn.net/Aelous_dp/article/details/107445147
