Crawler (17): Scrapy Framework (4) Integrating Selenium to Scrape JD Product Data (5)

# middlewares.py
import time

from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class SeleniumMiddleware(object):

    def __init__(self, timeout=None):
        self.timeout = timeout
        self.browser = webdriver.PhantomJS()
        self.browser.set_window_size(1400, 700)
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)

    def __del__(self):
        self.browser.close()

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'))

    def process_request(self, request, spider):
        '''
        Drive Selenium from this downloader middleware: once the page source
        has been rendered, build an HtmlResponse object and return it directly
        to the spider for parsing and data extraction, so the downloader no
        longer downloads the page itself.
        HtmlResponse documentation:
        :param request:
        :param spider:
        :return:
        '''
        print('PhantomJS is Starting')
        page = request.meta.get('page', 1)
        self.wait = WebDriverWait(self.browser, self.timeout)
        # self.browser.set_page_load_timeout(30)
        # self.browser.set_script_timeout(30)
        try:
            self.browser.get(request.url)
            if page > 1:
                input = self.wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input')))
                input.clear()
                input.send_keys(page)
                time.sleep(5)

                # Grab the page-jump input box; EC.presence_of_element_located
                # waits until the input box has been loaded
                input = self.wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input')))
                # Grab the page-jump confirm button; EC.element_to_be_clickable
                # waits until the button is clickable
                submit = self.wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a')))
                input.clear()
                input.send_keys(page)
                submit.click()  # click the button
                time.sleep(5)

                # Confirm the jump worked: EC.text_to_be_present_in_element
                # waits until the given string appears in the element, i.e. the
                # highlighted page number matches the requested page
                self.wait.until(EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.curr'), str(page)))
            # Wait until #J_goodsList (the product list data) has been loaded,
            # then return the page source
            self.wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#J_goodsList')))
            return HtmlResponse(url=request.url, body=self.browser.page_source,
                                request=request, encoding='utf-8', status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
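
For requests to actually pass through this middleware, it still has to be switched on in settings.py. A minimal sketch of the relevant settings, assuming the project module is called jd (a placeholder) and the middleware class keeps the name SeleniumMiddleware used above:

    # settings.py (sketch; 'jd' is a placeholder for the real project module)
    SELENIUM_TIMEOUT = 20  # example value; read by from_crawler() above

    DOWNLOADER_MIDDLEWARES = {
        'jd.middlewares.SeleniumMiddleware': 543,
    }

Note also that process_request() reads request.meta.get('page', 1), so the spider is expected to pass the target page number when it yields each request, e.g. yield scrapy.Request(url, callback=self.parse, meta={'page': page}).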
pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo


class MongoPipeline(object):

    def __init__(self, mongo_url, mongo_db, collection):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db
        self.collection = collection
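
The listing above is cut off after the constructor. The remaining methods would typically follow the standard MongoDB pipeline pattern from the Scrapy item-pipeline docs; a minimal sketch, in which the setting keys MONGO_URL, MONGO_DB and COLLECTION are assumptions mirroring the constructor arguments:

    # sketch of the rest of MongoPipeline; the setting keys are assumptions
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL'),
            mongo_db=crawler.settings.get('MONGO_DB'),
            collection=crawler.settings.get('COLLECTION')
        )

    def open_spider(self, spider):
        # open one MongoDB connection for the whole crawl
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # store each scraped item as one document
        self.db[self.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()

As the header comment says, the pipeline then needs an entry in the ITEM_PIPELINES setting, e.g. ITEM_PIPELINES = {'jd.pipelines.MongoPipeline': 300} (module name again a placeholder).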