Python Basics Tutorial, Crawler Chapter: Recursively Crawling All Articles, Videos, and Micro-Headlines Posted by a Given Toutiao User in the Past Month (Part 2)
The listing below continues from Part 1, which defines the imports (requests, csv, json, re, os, time, threading, queue.Queue, datetime) as well as headers_a, cookies, the headers() helper, the break_flag/break_flag_video lists, and the wenzhang() function and the head of shipin(). It picks up inside shipin(), where each scraped video row is appended to the CSV file.

                    with open('/toutiao/' + str(csv_name) + '视频.csv', 'a', newline='', encoding='gb18030') as f:
                        f_csv = csv.DictWriter(f, headers2)
                        # f_csv.writeheader()
                        f_csv.writerow(row)
                    print('正在爬取视频:', video_title, detail_url, video_url)
                    time.sleep(3)
                except Exception as e:
                    print(e, 'https://www.ixigua.com/i' + i['item_id'])
            # recurse with the updated max_behot_time to fetch the next page
            shipin(url=url, max_behot_time=max_behot_time, csv_name=csv_name, n=n)
        except KeyError:
            n += 1
            print('第' + str(n) + '次请求', first_url)
            time.sleep(3)
            if n == max_qingqiu:
                print('请求超过最大次数')
                break_flag_video.append(1)
        except Exception as e:
            print(e)


# 微头条 (micro-headlines)
break_flag_weitoutiao = []

def weitoutiao(url, max_behot_time=0, n=0, csv_name=0):
    max_qingqiu = 20  # maximum number of retry requests
    headers3 = ['微头条发表时间', '来源', '标题', '文章内图片', '微头条内容']
    while n < max_qingqiu and not break_flag_weitoutiao:
        try:
            first_url = ('https://www.toutiao.com/api/pc/feed/'
                         '?category=pc_profile_ugc&utm_source=toutiao'
                         '&visit_user_id=%s&max_behot_time=%s'
                         % (url.split('/')[-2], max_behot_time))
            res = requests.get(first_url, headers=headers_a, cookies=cookies)
            data = json.loads(res.text)
            max_behot_time = data['next']['max_behot_time']
            weitoutiao_list = data['data']
            for i in weitoutiao_list:
                try:
                    detail_url = 'https://www.toutiao.com/a' + str(i['concern_talk_cell']['id'])
                    resp = requests.get(detail_url, headers=headers(), cookies=cookies)
                    # the detail page embeds its data in inline JavaScript,
                    # so the fields are extracted with regular expressions
                    start_time = re.findall(r"time: '(.*?)'", resp.text, re.S)
                    weitoutiao_name = re.findall(r"name: '(.*?)'", resp.text, re.S)
                    weitoutiao_title = re.findall(r"title: '(.*?)'", resp.text, re.S)
                    weitoutiao_images = re.findall(r'images: \["(.*?)"\]', resp.text, re.S)
                    if weitoutiao_images:
                        # unescape the \u002F sequences to rebuild the image URL
                        weitoutiao_image = 'http:' + weitoutiao_images[0].replace('u002F', '/').replace('\\', '')
                    else:
                        weitoutiao_image = '此头条内无附件图片'
                    weitoutiao_content = re.findall(r"content: '(.*?)'", resp.text, re.S)
                    # '2020-01-02 12:34:56' -> ['2020', '01', '02']
                    result_time = str(start_time[0]).split(' ')[0].split('-')
                    cha = (datetime.now() - datetime(int(result_time[0]), int(result_time[1]), int(result_time[2]))).days
                    if cha > 30:  # stop once posts are older than one month
                        break_flag_weitoutiao.append(1)
                        print('完成')
                        break
                    row = {'微头条发表时间': start_time[0], '来源': weitoutiao_name[0],
                           '标题': weitoutiao_title[0].strip('"'), '文章内图片': weitoutiao_image,
                           '微头条内容': weitoutiao_content[0].strip('"')}
                    with open('/toutiao/' + str(csv_name) + '微头条.csv', 'a', newline='', encoding='gb18030') as f:
                        f_csv = csv.DictWriter(f, headers3)
                        # f_csv.writeheader()
                        f_csv.writerow(row)
                    time.sleep(1)
                    print('正在爬取微头条', weitoutiao_name[0], start_time[0], detail_url)
                except Exception as e:
                    print(e, 'https://www.toutiao.com/a' + str(i['concern_talk_cell']['id']))
            # recurse to fetch the next page of micro-headlines
            weitoutiao(url=url, max_behot_time=max_behot_time, csv_name=csv_name, n=n)
        except KeyError:
            n += 1
            print('第' + str(n) + '次请求')
            time.sleep(2)
            if n == max_qingqiu:
                print('请求超过最大次数')
                break_flag_weitoutiao.append(1)
        except Exception as e:
            print(e)


# read the list of users to crawl
def csv_read(path):
    data = []
    with open(path, 'r', encoding='gb18030') as f:
        reader = csv.reader(f, dialect='excel')
        for row in reader:
            data.append(row)
    return data


# single-threaded entry point
def main():
    for j, i in enumerate(csv_read('toutiao-suoyou.csv')):
        if '文章' in i[3]:
            # crawl this user's articles
            print('当前正在抓取文章第', j, i[2])
            headers1 = ['发表时间', '标题', '来源', '所有图片', '文章内容']
            # write the header row once; the crawl function appends the data
            # rows (gb18030 added here to match the appends)
            with open('/toutiao/' + i[0] + '文章.csv', 'a', newline='', encoding='gb18030') as f:
                f_csv = csv.DictWriter(f, headers1)
                f_csv.writeheader()
            break_flag.clear()
            wenzhang(url=i[2], csv_name=i[0])
        if '视频' in i[3]:
            # crawl this user's videos
            print('当前正在抓取视频第', j, i[2])
            headers2 = ['视频发表时间', '标题', '来源', '视频链接']
            with open('/toutiao/' + i[0] + '视频.csv', 'a', newline='', encoding='gb18030') as f:
                f_csv = csv.DictWriter(f, headers2)
                f_csv.writeheader()
            break_flag_video.clear()
            shipin(url=i[2], csv_name=i[0])
        if '微头条' in i[3]:
            # crawl this user's micro-headlines
            headers3 = ['微头条发表时间', '来源', '标题', '文章内图片', '微头条内容']
            print('当前正在抓取微头条第', j, i[2])
            with open('/toutiao/' + i[0] + '微头条.csv', 'a', newline='', encoding='gb18030') as f:
                f_csv = csv.DictWriter(f, headers3)
                f_csv.writeheader()
            break_flag_weitoutiao.clear()
            weitoutiao(url=i[2], csv_name=i[0])


# worker for the multi-threaded mode
def get_all(urlQueue):
    while True:
        try:
            # read from the queue without blocking; an empty queue raises
            data_url = urlQueue.get_nowait()
        except Exception:
            break
        # the article branch is disabled here; it follows the same pattern
        # as the '文章' branch in main()
        if '视频' in data_url[3]:
            print('当前正在抓取视频', data_url[2])
            headers2 = ['视频发表时间', '标题', '来源', '视频链接']
            with open('/toutiao/' + data_url[0] + '视频.csv', 'a', newline='', encoding='gb18030') as f:
                f_csv = csv.DictWriter(f, headers2)
                f_csv.writeheader()
            break_flag_video.clear()
            shipin(url=data_url[2], csv_name=data_url[0])
        # the micro-headline branch is likewise disabled; same pattern as
        # the '微头条' branch in main()


if __name__ == '__main__':
    # create the output directory
    path = '/toutiao/'
    if not os.path.exists(path):
        os.mkdir(path)
    # For a single script run, call main(). The multi-threaded mode below
    # controls the crawl speed via the thread count; too many threads send
    # requests so frequently that Toutiao's anti-crawling measures will ban
    # the IP, so proxy IPs are needed in that case.
    # main()
    urlQueue = Queue()
    for j, i in enumerate(csv_read('toutiao-suoyou.csv')):
        urlQueue.put(i)
    threads = []
    # adjust the thread count to control the crawl speed
    threadNum = 4
    for i in range(0, threadNum):
        t = threading.Thread(target=get_all, args=(urlQueue,))
        threads.append(t)
    for t in threads:
        # t.setDaemon(True)  # daemon threads would exit with the main thread
        t.start()
    for t in threads:
        # join each thread in turn so the main thread exits last
        t.join()
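The comment at the end of the listing warns that the multi-threaded mode fires requests frequently enough to trigger Toutiao's anti-crawling and get the IP banned, so proxy IPs are needed. Below is a minimal sketch of how a proxy could be passed to the requests calls; the proxy address and the helper name get_with_proxy are placeholders for illustration, not part of the original script:

import requests

# Hypothetical proxy endpoint -- substitute a real address from your proxy pool.
PROXIES = {
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:8888',
}

def get_with_proxy(url, **kwargs):
    # Same call shape as the requests.get() calls in the crawler,
    # with the proxies mapping (and a timeout) added.
    return requests.get(url, proxies=PROXIES, timeout=10, **kwargs)

Rotating the proxy per request (e.g. popping addresses from a pool) would spread the load further; the single fixed address above is only the simplest case.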
试听地址 https://www.xin3721.com/eschool/pythonxin3721/
'/toutiao/' + str(csv_name) + '视频.csv', 'a', newline='', encoding='gb18030')as f: f_csv = csv.DictWriter(f, headers2) # f_csv.writeheader() f_csv.writerow(row) print('正在爬取视频:', video_title, detail_url, video_url) time.sleep(3) except Exception as e: print(e, 'https://www.ixigua.com/i' + i['item_id']) shipin(url=url, max_behot_time=max_behot_time, csv_name=csv_name, n=n) except KeyError: n += 1 print('第' + str(n) + '次请求', first_url) time.sleep(3) if n == max_qingqiu: print('请求超过最大次数') break_flag_video.append(1) except Exception as e: print(e) else: pass # 微头条 break_flag_weitoutiao = [] def weitoutiao(url, max_behot_time=0, n=0, csv_name=0): max_qingqiu = 20 headers3 = ['微头条发表时间', '来源', '标题', '文章内图片', '微头条内容'] while n < max_qingqiu and not break_flag_weitoutiao: try: first_url = 'https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=%s&max_behot_time=%s' % ( url.split('/')[-2], max_behot_time) # print(first_url) res = requests.get(first_url, headers=headers_a, cookies=cookies) data = json.loads(res.text) # print(data) max_behot_time = data['next']['max_behot_time'] weitoutiao_list = data['data'] for i in weitoutiao_list: try: detail_url = 'https://www.toutiao.com/a' + str(i['concern_talk_cell']['id']) # print(detail_url) resp = requests.get(detail_url, headers=headers(), cookies=cookies) start_time = re.findall("time: '(.*?)'", resp.text, re.S) weitoutiao_name = re.findall("name: '(.*?)'", resp.text, re.S) weitoutiao_title = re.findall("title: '(.*?)'", resp.text, re.S) weitoutiao_images = re.findall('images: \["(.*?)"\]',resp.text,re.S) # print(weitoutiao_images) if weitoutiao_images: weitoutiao_image = 'http:' + weitoutiao_images[0].replace('u002F','/').replace('\\','') # print(weitoutiao_image) else: weitoutiao_image = '此头条内无附件图片' weitoutiao_content = re.findall("content: '(.*?)'", resp.text, re.S) result_time = [] [result_time.append(i) for i in str(start_time[0]).split(' ')[0].replace('-', ',').split(',')] # print(result_time) cha = ( datetime.now() - datetime(int(result_time[0]), int(result_time[1]), int(result_time[2]))).days # print(cha) if cha > 30: break_flag_weitoutiao.append(1) print('完成') break row = {'微头条发表时间': start_time[0], '来源': weitoutiao_name[0], '标题': weitoutiao_title[0].strip('"'),'文章内图片': weitoutiao_image, '微头条内容': weitoutiao_content[0].strip('"')} with open('/toutiao/' + str(csv_name) + '微头条.csv', 'a', newline='', encoding='gb18030')as f: f_csv = csv.DictWriter(f, headers3) # f_csv.writeheader() f_csv.writerow(row) time.sleep(1) print('正在爬取微头条', weitoutiao_name[0], start_time[0], detail_url) except Exception as e: print(e, 'https://www.toutiao.com/a' + str(i['concern_talk_cell']['id'])) weitoutiao(url=url, max_behot_time=max_behot_time, csv_name=csv_name, n=n) except KeyError: n += 1 print('第' + str(n) + '次请求') time.sleep(2) if n == max_qingqiu: print('请求超过最大次数') break_flag_weitoutiao.append(1) else: pass except Exception as e: print(e) else: pass # 获取需要爬取的网站数据 def csv_read(path): data = [] with open(path, 'r', encoding='gb18030') as f: reader = csv.reader(f, dialect='excel') for row in reader: data.append(row) return data # 启动函数 def main(): for j, i in enumerate(csv_read('toutiao-suoyou.csv')): # data_url = data.get_nowait() if '文章' in i[3]: # 启动抓取文章函数 print('当前正在抓取文章第', j, i[2]) headers1 = ['发表时间', '标题', '来源', '所有图片', '文章内容'] with open('/toutiao/' + i[0] + '文章.csv', 'a', newline='')as f: f_csv = csv.DictWriter(f, headers1) f_csv.writeheader() break_flag.clear() wenzhang(url=i[2], csv_name=i[0]) if '视频' in i[3]: # 启动爬取视频的函数 
print('当前正在抓取视频第', j, i[2]) headers2 = ['视频发表时间', '标题', '来源', '视频链接'] with open('/toutiao/' + i[0] + '视频.csv', 'a', newline='')as f: f_csv = csv.DictWriter(f, headers2) f_csv.writeheader() break_flag_video.clear() shipin(url=i[2], csv_name=i[0]) if '微头条' in i[3]: # 启动获取微头条的函数 headers3 = ['微头条发表时间', '来源', '标题', '文章内图片', '微头条内容'] print('当前正在抓取微头条第', j, i[2]) with open('/toutiao/' + i[0] + '微头条.csv', 'a', newline='')as f: f_csv = csv.DictWriter(f, headers3) f_csv.writeheader() break_flag_weitoutiao.clear() weitoutiao(url=i[2], csv_name=i[0]) # 多线程启用 def get_all(urlQueue): while True: try: # 不阻塞的读取队列数据 data_url = urlQueue.get_nowait() # i = urlQueue.qsize() except Exception as e: break # print(data_url) # if '文章' in data_url[3]: # # 启动抓取文章函数 # print('当前正在抓取文章', data_url[2]) # headers1 = ['发表时间', '标题', '来源', '所有图片', '文章内容'] # with open('/toutiao/' + data_url[0] + '文章.csv', 'a', newline='')as f: # f_csv = csv.DictWriter(f, headers1) # f_csv.writeheader() # break_flag.clear() # wenzhang(url=data_url[2], csv_name=data_url[0]) if '视频' in data_url[3]: # 启动爬取视频的函数 print('当前正在抓取视频', data_url[2]) headers2 = ['视频发表时间', '标题', '来源', '视频链接'] with open('/toutiao/' + data_url[0] + '视频.csv', 'a', newline='')as f: f_csv = csv.DictWriter(f, headers2) f_csv.writeheader() break_flag_video.clear() shipin(url=data_url[2], csv_name=data_url[0]) # # if '微头条' in data_url[3]: # # 启动获取微头条的函数 # headers3 = ['微头条发表时间', '来源', '标题','文章内图片', '微头条内容'] # print('当前正在抓取微头条', data_url[2]) # with open('/toutiao/' + data_url[0] + '微头条.csv', 'a', newline='')as f: # f_csv = csv.DictWriter(f, headers3) # f_csv.writeheader() # break_flag_weitoutiao.clear() # weitoutiao(url=data_url[2], csv_name=data_url[0]) if __name__ == '__main__': # 创建存储目录 path = '/toutiao/' if not os.path.exists(path): os.mkdir(path) """单一脚本使用main函数,开启多线程按照下面方法控制线程数,开启多线程会请求过于频繁,导致头条反爬封ip等,需要设置代理ip""" # main() urlQueue = Queue() for j, i in enumerate(csv_read('toutiao-suoyou.csv')): urlQueue.put(i) # print(urlQueue.get_nowait()) # print(urlQueue.qsize()) threads = [] # 可以调节线程数, 进而控制抓取速度 threadNum = 4 for i in range(0, threadNum): t = threading.Thread(target=get_all, args=(urlQueue,)) threads.append(t) for t in threads: # 设置为守护线程,当守护线程退出时,由它启动的其它子线程将同时退出, # t.setDaemon(True) t.start() for t in threads: # 多线程多join的情况下,依次执行各线程的join方法, 这样可以确保主线程最后退出, 且各个线程间没有阻塞 t.join() # pass
[Screenshot: reading the user information from the CSV file]
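The screenshot is not reproduced here, but from how main() indexes each row (i[0] is the output file-name prefix, i[2] the profile URL whose second-to-last path segment is the user id, i[3] the content types to crawl; i[1] is never used), a row of toutiao-suoyou.csv presumably looks something like the following. The name and user id are made up for illustration:

示例用户,备注,https://www.toutiao.com/c/user/1234567890/,文章 视频 微头条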
[Screenshot: the crawl results]
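Since every output file is written with encoding='gb18030', it must be read back with the same encoding or the Chinese columns will be garbled. A minimal sketch, using the hypothetical 示例用户 prefix from the input row above:

import csv

# '示例用户' is the illustrative prefix taken from column 0 of the input CSV.
with open('/toutiao/示例用户视频.csv', 'r', encoding='gb18030', newline='') as f:
    for row in csv.reader(f):
        print(row)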
This content is for reference and learning only; contact the author if you would like it taken down.