import random
import re
import time

import requests
from lxml import etree
"""初始化参数""" kw = '篮球' base_url = 'http://tieba.baidu.com/f' headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.46"} page_num = 1 title = '' path = 'E:\\作业图片\\spider\\'
def parse_text(url, params=None):
    """Send a request and return the response body as text."""
    time.sleep(random.randint(1, 5))  # random delay to avoid hammering the server
    req = requests.get(url, headers=headers, params=params)
    return req.text
def parse_byte(url, params=None):
    """Send a request and return the response body as raw bytes (for images)."""
    time.sleep(random.random() * 2)  # shorter random delay for image downloads
    req = requests.get(url, headers=headers, params=params)
    return req.content
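# Neither helper checks the HTTP status or guards against network errors, so a
# single failed request aborts the whole crawl. A minimal retry wrapper, as a
# sketch; fetch_text_with_retry, the timeout, and the attempt count are
# illustrative, not part of the original script:
def fetch_text_with_retry(url, params=None, attempts=3):
    for i in range(attempts):
        try:
            req = requests.get(url, headers=headers, params=params, timeout=10)
            req.raise_for_status()  # raise on 4xx/5xx responses
            return req.text
        except requests.RequestException:
            time.sleep(2 ** i)  # simple exponential backoff
    raise RuntimeError('giving up on ' + url)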
def page(content, page_num=1):
    """Parse one list page: collect thread links, then follow the next-page link."""
    print('Crawling page {}...'.format(page_num))
    page_num += 1
    # Extract (url, title) pairs for every thread on the page
    url_title = re.findall(
        r'<a rel="noreferrer" href="(/p/\d+?)" title=".+?" target="_blank" class="j_th_tit ">(.+?)</a>',
        content)
    # Leftover debug code that printed the same links via XPath; kept as a
    # comment because the regex above is what actually drives the crawl:
    # for a in etree.HTML(content).xpath('//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a'):
    #     print(a)
    for url, title in url_title:
        # Keep only Chinese characters so the title is safe to use in file names
        title = re.sub('[^\u4e00-\u9fa5]+', '', title)
        detail('https://tieba.baidu.com' + url, title)
        save_title(title)
    # "下一页>" is the literal "next page" link text in Tieba's markup
    next_url = re.findall(r'<a href="(.*?)" .*?>下一页></a>', content)
    if next_url:
        next_url = 'https:' + next_url[0]
        content = parse_text(url=next_url)
        page(content, page_num)
    else:
        print('Crawl finished...')
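# page() recurses once per list page, so a crawl deeper than Python's default
# recursion limit (1000 frames) would raise RecursionError. An iterative
# equivalent, as a sketch; crawl_pages is an illustrative name, not part of
# the original script:
def crawl_pages(content):
    page_num = 1
    while True:
        print('Crawling page {}...'.format(page_num))
        for url, title in re.findall(
                r'<a rel="noreferrer" href="(/p/\d+?)" title=".+?" target="_blank" class="j_th_tit ">(.+?)</a>',
                content):
            title = re.sub('[^\u4e00-\u9fa5]+', '', title)
            detail('https://tieba.baidu.com' + url, title)
            save_title(title)
        next_url = re.findall(r'<a href="(.*?)" .*?>下一页></a>', content)
        if not next_url:
            print('Crawl finished...')
            break
        content = parse_text(url='https:' + next_url[0])
        page_num += 1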
def detail(url, title):
    """Fetch one thread and download every image embedded in its posts."""
    content = parse_text(url=url)
    urls = re.findall(r'<img class="BDE_Image".*?src="(.*?)".*?>', content)
    for url in urls:
        save_img(url, title)
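# The image regex above depends on the exact attribute order in Tieba's
# markup. Since lxml is already imported, the same URLs can be collected with
# XPath, which is order-insensitive; a sketch (extract_img_urls is an
# illustrative name, not in the original script):
def extract_img_urls(content):
    return etree.HTML(content).xpath('//img[@class="BDE_Image"]/@src')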
def save_title(title):
    """Append the thread title to a text file."""
    with open(path + 'tieba\\tieba_{}.txt'.format(kw), 'a', encoding='utf-8') as file:
        file.write(title)
        file.write('\n')
def save_img(url, title):
    """Download one image and write it under the images directory."""
    content = parse_byte(url=url)
    # Use only the last path segment of the URL: a raw slice like url[-30:]
    # can contain '/' and produce an invalid file name
    image_path = path + 'tieba\\images\\{}_{}'.format(title, url.split('/')[-1])
    with open(image_path, 'wb') as file:
        file.write(content)
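# If the same image appears in several threads it is saved once per title, and
# different URLs can end in identical file names. Naming files by a hash of
# their content sidesteps both; a sketch (save_img_hashed is an illustrative
# name, hashlib is in the standard library):
import hashlib

def save_img_hashed(url, title):
    content = parse_byte(url=url)
    name = hashlib.md5(content).hexdigest() + '.jpg'  # assumes JPEG payloads
    with open(path + 'tieba\\images\\{}_{}'.format(title, name), 'wb') as file:
        file.write(content)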
print('Crawl starting...')
content = parse_text(url=base_url, params={'kw': kw, 'ie': 'utf-8', 'fr': 'search'})
page(content)