Crawler (17): Scrapy Framework (4): Integrating Selenium to Crawl JD Product Data (4)

Picking up the spider's parse() method from the previous page, each product node li is read into a dict and yielded:

# the product title lives in the "p-name" div
title = li.find(name='div', class_="p-name p-name-type-2")
proc_dict['title'] = title.get_text().strip()
# the price <strong> tag's class embeds the product's sku id
price = li.find(name='strong', class_="J_" + id)
proc_dict['price'] = price.get_text()
# the comment-count link is identified by "J_comment_<sku id>"
comment = li.find(name='a', id="J_comment_" + id)
proc_dict['comment'] = comment.get_text() + '条评论'  # '条评论' = "comments"
# the product detail URL is derived from the sku id
url = 'https://item.jd.com/' + id + '.html'
proc_dict['url'] = url
proc_dict['type'] = 'JINGDONG'
yield proc_dict
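This fragment relies on two names bound earlier in parse(): li, one product node from the rendered list page, and id, that product's sku. As a rough sketch of how they are typically obtained with BeautifulSoup (whose find/get_text API the fragment uses), assuming JD's list markup uses li.gl-item nodes carrying a data-sku attribute; both are assumptions here, not code confirmed by the article:

from bs4 import BeautifulSoup

def iter_products(html):
    """Yield (li, sku_id) pairs for each product node on a rendered list page."""
    soup = BeautifulSoup(html, 'lxml')
    # 'gl-item' and 'data-sku' are assumed JD markup details (see above).
    for li in soup.find_all(name='li', class_='gl-item'):
        yield li, li.attrs['data-sku']

Each (li, sku_id) pair then feeds the field extraction shown above.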
middlewares.py:

    
    	
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import urlencode
from scrapy.http import HtmlResponse
from logging import getLogger
from selenium.common.exceptions import TimeoutException
import time


class ScrapyseleniumtestSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class SeleniumMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def __init__(self, timeout=None):
        self.logger = getLogger(__name__)
        self.timeout = timeout
        self.browser = webdriver.Chrome()
        # The listing is cut off here; a typical continuation (an
        # assumption, not the article's confirmed code) applies the
        # timeout and builds the explicit wait used when rendering pages:
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)
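The page ends mid-listing; middlewares.py continues in the next part. Still, the imports above (WebDriverWait, EC, HtmlResponse, TimeoutException) already outline what the missing half does: a process_request hook loads each URL in Chrome, waits for the JavaScript-rendered content, and returns the final HTML to Scrapy. Below is a minimal sketch of that hook, continuing the SeleniumMiddleware class; it is an assumption about the shape of the code, not the article's exact continuation, and '.gl-item' is an assumed selector for JD's product nodes:

    def process_request(self, request, spider):
        # Render the page in the real browser instead of downloading it.
        self.logger.debug('Rendering %s with Chrome' % request.url)
        try:
            self.browser.get(request.url)
            # Block until the JS-rendered product list appears
            # ('.gl-item' is an assumed JD list selector).
            self.wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.gl-item')))
            # Hand the rendered DOM back to Scrapy as an ordinary response.
            return HtmlResponse(url=request.url, body=self.browser.page_source,
                                request=request, encoding='utf-8', status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)

Whatever the final version looks like, remember that a downloader middleware only takes effect after it is registered under DOWNLOADER_MIDDLEWARES in settings.py.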
    