VB.net 2010 视频教程 VB.net 2010 视频教程 python基础视频教程
SQL Server 2008 视频教程 c#入门经典教程 Visual Basic从入门到精通视频教程
当前位置:
首页 > Python基础教程 >
  • 爬虫(十七):Scrapy框架(四) 对接selenium爬取京东商品数据(6)

mongo_url
  • self.mongo_db = mongo_db
  • self.collection = collection
  •  
  • @classmethod
  • #from_crawler是一个类方法,由 @classmethod标识,是一种依赖注入的方式,它的参数就是crawler
  • #通过crawler我们可以拿到全局配置的每个配置信息,在全局配置settings.py中的配置项都可以取到。
  • #所以这个方法的定义主要是用来获取settings.py中的配置信息
  • def from_crawler(cls,crawler):
  • return cls(
  • mongo_url=crawler.settings.get('MONGO_URL'),
  • mongo_db = crawler.settings.get('MONGO_DB'),
  • collection = crawler.settings.get('COLLECTION')
  • )
  •  
  • def open_spider(self,spider):
  • self.client = pymongo.MongoClient(self.mongo_url)
  • self.db = self.client[self.mongo_db]
  •  
  • def process_item(self,item, spider):
  • # name = item.__class__.collection
  • name = self.collection
  • self.db[name].insert(dict(item))
  • return item
  •  
  • def close_spider(self,spider):
  • self.client.close()
  • settings.py:

    
    	
    1. # -*- coding: utf-8 -*-
    2.  
    3. # Scrapy settings for scrapyseleniumtest project
    4. #
    5. # For simplicity, this file contains only settings considered important or
    6. # commonly used. You can find more settings consulting the documentation:
    7. #
    8. # https://docs.scrapy.org/en/latest/topics/settings.html
    9. # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    10. # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    11.  
    12. BOT_NAME = 'scrapyseleniumtest'
    13.  
    14. SPIDER_MODULES = ['scrapyseleniumtest.spiders']
    15. NEWSPIDER_MODULE = 'scrapyseleniumtest.spiders'
    16.  
    17.  
    18. # Crawl responsibly by identifying yourself (and your website) on the user-agent
    19. #USER_AGENT = 'scrapyseleniumtest (+http://www.yourdomain.com)'
    20.  
    21. # Obey robots.txt rules
    22. ROBOTSTXT_OBEY = False
    23.  
    24. # Configure maximum concurrent requests performed by Scrapy (default: 16)
    25. #CONCURRENT_REQUESTS = 32
    26.  
    27. # Configure a delay for requests for the same website (default: 0)
    28. # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
    29. # See also autothrottle settings and docs
    30. #DOWNLOAD_DELAY = 3
    31. # The download delay setting will honor only one of:
    32. #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    33. #CONCURRENT_REQUESTS_PER_IP = 16
    34.  
    35. # Disable cookies (enabled by default)
    36. #COOKIES_ENABLED = False
    37.  
    38. # Disable Telnet Console (enabled by default)
    39. #TELNETCONSOLE_ENABLED = False
    40.  
    41. # Override the default request headers:
    42. #DEFAULT_REQUEST_HEADERS = {
    43. # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    44. # 'Accept-Language': 'en',
    45. #}
    46.  
    47. # Enable or disable spider middlewares
    48. # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    49. #SPIDER_MIDDLEWARES = {
    50. # 'scrapyseleniumtest.middlewares.ScrapyseleniumtestSpiderMiddleware': 543,
    51. #}
    52.  
    53. # Enable or disable downloader middlewares
    54. # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    55. #DOWNLOADER_MIDDLEWARES = {
    56. # 'scrapyseleniumtest.middlewares.ScrapyseleniumtestDownloaderMiddleware': 543,
    57. #}
    58. DOWNLOADER_MIDDLEWARES = {
    59. 'scrapyseleniumtest.middlewares.SeleniumMiddleware': 543,
    60. }
    61. # Enable or disable extensions
    62. # See https://docs.scrapy.org/en/latest/topics/extensions.html
    63. #EXTENSIONS = {
    64. # 'scrapy.extensions.telnet.TelnetConsole': None,
    65. #}
    66.  
    67. # Configure item pipelines
    68. # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    69. #ITEM_PIPELINES = {
    70. # 'scrapyseleniumtest.pipelines.ScrapyseleniumtestPipeline': 300,
    71. #}
    72. ITEM_PIPELINES = {
    73. 'scrapyseleniumtest.pipelines.MongoPipeline': 300,
    74. }
    75. # Enable and configure the AutoThrottle extension (disabled by default)
    76. # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
    77. #AUTOTHROTTLE_ENABLED = True
    78. # The initial download delay
    79. #AUTOTHROTTLE_START_DELAY = 5
    80. # The maximum download delay to be set in case of high latencies
    81. #AUTOTHROTTLE_MAX_DELAY = 60
    82. # The average number of requests Scrapy should be sending in parallel to
    83. # each remote server
    84. #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    85. # Enable showing throttling stats for every response received:
    86. #AUTOTHROTTLE_DEBUG = False
    87.  
    88. # Enable and configure HTTP caching (disabled by default)
    89. # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    90. #HTTPCACHE_ENABLED = True
    91. #HTTPCACHE_EXPIRATION_SECS = 0
    92. #HTTPCACHE_DIR = 'httpcache'
    93. #HTTPCACHE_IGNORE_HTTP_CODES = []
    94. #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    95. KEYWORDS=['iPad']
    96. MAX_PAGE=2
    97.  
    98. MONGO_URL = 'localhost
    
    相关教程