VB.net 2010 视频教程 VB.net 2010 视频教程 python基础视频教程
SQL Server 2008 视频教程 c#入门经典教程 Visual Basic从入门到精通视频教程
当前位置:
首页 > temp > python入门教程 >
  • PYTHON爬取图片

PYTHON爬取图片

 
from threading import Thread
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process, Queue
import requests
from lxml import etree
from urllib import parse

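# Exception handling has not been optimized yet; to be improved later.
# Open issue 1: this crawls only the image shown on each linked page across several list pages; images nested deeper are not handled yet.
# Open issue 2: crawling too many pages raises an error; the cause has not been identified yet.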

# HTTP headers sent with every request made by this script.
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Mobile Safari/537.36",
    # Anti-hotlinking: Referer tells the server which page the current request came from.
    "Referer": "https://xxx"
}


def _fetch_tree(url):
    """Fetch ``url`` and return its parsed HTML tree, or ``None`` on failure."""
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
    except requests.RequestException as exc:
        print(f"request failed, skipping {url}: {exc}")
        return None
    resp.encoding = 'utf-8'
    try:
        # etree.HTML may also hand back None for an unparsable/empty document;
        # callers treat None as "skip this page".
        return etree.HTML(resp.text)
    except (etree.LxmlError, ValueError) as exc:
        print(f"could not parse {url}: {exc}")
        return None


def get_img_src(q):
    """Collect image URLs from the list pages and put them on ``q``.

    Visits list pages 1-4, follows every link found under the
    ``list-box-p`` container, and puts the first ``img_box`` image URL of
    each linked page on the queue. Pages that cannot be fetched, cannot be
    parsed, or contain no matching image are skipped with a message.

    The end-of-stream marker "已经没了" is always put on the queue last,
    even when an unexpected error interrupts the crawl, so a consumer
    blocked on ``q.get()`` is never left waiting forever.

    Args:
        q: Queue-like object with a ``put`` method that receives the image
            URLs followed by the end-of-stream marker.
    """
    # The first list page is index.html; the following pages are numbered.
    urls = [
        "https://xxx/index.html" if page == 1 else f"https://xxx/{page}.html"
        for page in range(1, 5)
    ]
    try:
        for page_url in urls:
            list_tree = _fetch_tree(page_url)
            if list_tree is None:
                continue
            for href in list_tree.xpath("//div[@class='list-box-p']/ul/li/a/@href"):
                # urljoin leaves absolute links untouched and resolves relative ones.
                child_url = parse.urljoin(page_url, href)
                child_tree = _fetch_tree(child_url)
                if child_tree is None:
                    continue
                # xpath returns a list; it is empty when the page has no such image.
                found = child_tree.xpath("//div[@class='img_box']/a/img/@src")
                if not found:
                    print(f"no image found, skipping {child_url}")
                    continue
                src = parse.urljoin(child_url, found[0])
                q.put(src)  # queue the URL for the downloader
                print(f"---------------------------------------------------被塞进队列--------------------->{src}")
    finally:
        q.put("已经没了")


def download(src):
    """Download the image at ``src`` into the ``./image/`` directory.

    The file name is the last ``/``-separated segment of ``src``. The image
    is fetched before the output file is opened, so a failed request does
    not leave an empty file behind. The target directory is created when
    it does not exist yet.

    Args:
        src: URL of the image to download.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    import os

    print('开始下载------------>', src)
    name = src.split('/')[-1]
    # A timeout keeps a stalled connection from blocking a pool worker forever.
    resp = requests.get(src, headers=headers, timeout=10)
    # Do not save an HTML error page under an image file name.
    resp.raise_for_status()
    os.makedirs("./image", exist_ok=True)
    with open("./image/" + name, mode='wb') as f:
        f.write(resp.content)
    print('下载完毕------------>', src)


def download_img(q):
    """Take image URLs off ``q`` and submit each one to a 5-thread pool.

    ``q.get()`` blocks until an item is available. Submission stops as soon
    as the end-of-stream marker "已经没了" is read; leaving the ``with``
    block then waits for the submitted downloads to finish.

    Args:
        q: Queue-like object whose ``get`` method yields image URLs followed
            by the end-of-stream marker.
    """
    with ThreadPoolExecutor(5) as pool:
        # iter(callable, sentinel) calls q.get() repeatedly until it returns the marker.
        for img_url in iter(q.get, "已经没了"):
            pool.submit(download, img_url)


if __name__ == '__main__':
    # One shared queue links the URL collector (producer) to the downloader (consumer).
    img_queue = Queue()
    workers = [
        Process(target=get_img_src, args=(img_queue,)),
        Process(target=download_img, args=(img_queue,)),
    ]
    # Start the producer first, then the consumer.
    for worker in workers:
        worker.start()
 


相关教程