Scrapy with Selenium crawling but not scraping
Let's think about it a bit differently:
- open the page in the browser and click "Show More" until you get to the desired page
- initialize a scrapy TextResponse with the current page source (with all the necessary posts loaded) - see the sketch right below
- for every post, initialize an Item, yield a Request to the post page and pass the item instance from the request to the response in the meta dictionary
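The key trick is the second step: wrap Selenium's rendered HTML in a scrapy TextResponse so the usual XPath/CSS selectors work on it. A stripped-down sketch of just that bridge (the full spider below puts it all together):

from scrapy.http import TextResponse
from selenium import webdriver

driver = webdriver.Firefox()
driver.get('http://www.italki.com/entries/korean')

# wrap whatever the browser currently renders into a Scrapy response,
# so that regular .xpath()/.css() selectors can be applied to it
response = TextResponse(url=driver.current_url,
                        body=driver.page_source,
                        encoding='utf-8')
print(response.xpath('//title/text()').extract_first())

driver.close()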
Notes and changes I'm introducing:
- use a normal Spider class
- use Selenium Waits to wait for the "Show More" button to be visible
- close the driver instance in the spider_closed signal handler (see the note below if you are on a newer Scrapy version)
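One caveat: scrapy.xlib.pydispatch was deprecated and eventually removed in later Scrapy releases. The code below keeps the dispatcher call for older versions; on a newer Scrapy, the supported way to wire up the same signal is through from_crawler - a minimal sketch of just that wiring:

import scrapy
from scrapy import signals
from selenium import webdriver


class ItalkiSpider(scrapy.Spider):
    name = "italki"

    def __init__(self, *args, **kwargs):
        super(ItalkiSpider, self).__init__(*args, **kwargs)
        self.driver = webdriver.Firefox()

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # let Scrapy construct the spider, then register the shutdown hook
        spider = super(ItalkiSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        # called once when the crawl finishes, whatever the reason
        self.driver.close()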
The code:
import scrapy
from scrapy import signals
from scrapy.http import TextResponse
from scrapy.xlib.pydispatch import dispatcher

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class ItalkiItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    text = scrapy.Field()


class ItalkiSpider(scrapy.Spider):
    name = "italki"
    allowed_domains = ['italki.com']
    start_urls = ['http://www.italki.com/entries/korean']

    def __init__(self):
        self.driver = webdriver.Firefox()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        self.driver.close()

    def parse(self, response):
        # selenium part of the job
        self.driver.get('http://www.italki.com/entries/korean')
        while True:
            more_btn = WebDriverWait(self.driver, 10).until(
                EC.visibility_of_element_located((By.ID, "a_show_more"))
            )
            more_btn.click()

            # stop when we reach the desired page
            if self.driver.current_url.endswith('page=52'):
                break

        # now scrapy should do the job
        response = TextResponse(url=response.url,
                                body=self.driver.page_source,
                                encoding='utf-8')
        for post in response.xpath('//ul[@id="content"]/li'):
            item = ItalkiItem()
            item['title'] = post.xpath('.//a[@class="title_txt"]/text()').extract()[0]
            item['url'] = post.xpath('.//a[@class="title_txt"]/@href').extract()[0]
            yield scrapy.Request(item['url'], meta={'item': item}, callback=self.parse_post)

    def parse_post(self, response):
        item = response.meta['item']
        item["text"] = response.xpath('//div[@id="a_NMContent"]/text()').extract()
        return item
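If you save the spider as, say, italki_spider.py (the filename is arbitrary), you can try it out without a full Scrapy project via runspider and export the scraped items to a feed:

scrapy runspider italki_spider.py -o posts.json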
This is something you should use as a base and improve on to fill out all of the other fields, like author or author_url.
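For instance, author and author_url would follow the same pattern as title and url. The XPaths below are hypothetical placeholders - inspect the live markup and adjust the class name:

# inside ItalkiItem - declare the extra fields
author = scrapy.Field()
author_url = scrapy.Field()

# inside parse(), next to title/url - hypothetical XPaths, adjust to the page
item['author'] = post.xpath('.//a[@class="user_name"]/text()').extract_first()
item['author_url'] = post.xpath('.//a[@class="user_name"]/@href').extract_first()

Hope that helps.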