python - Scrapy crawl spider with AJAX pagination
I am trying to scrape a link that has AJAX-based pagination. I am trying to crawl http://www.demo.com, and in my .py file I have restricted the XPath for the pagination link. My code is:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from sum.items import SumItem

class SumSpider1(CrawlSpider):
    name = 'sumdetailsurls'
    allowed_domains = ['sum.com']
    start_urls = ['http://www.demo.com']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='.//ul[@id="pager"]/li[8]/a'),
             callback='parse_start_url', follow=True),
    )

    # use parse_start_url if the spider should also crawl the first page
    def parse_start_url(self, response):
        self.log('inside - parse_item %s' % response.url)
        hxs = HtmlXPathSelector(response)
        item = SumItem()
        item['page'] = response.url
        title = hxs.xpath('.//h1[@class="page-heading"]/text()').extract()
        urls = hxs.xpath('.//a[@id="linktodetails"]/@href').extract()
        finalurls = []
        for url in urls:
            finalurls.append(url)
        item['urls'] = finalurls
        return item
My items.py file contains:
from scrapy.item import Item, Field

class SumItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    page = Field()
    urls = Field()
Still I'm not getting the exact output: the spider is not able to fetch the subsequent pages when crawling, because the pagination links are loaded through an AJAX call.
I hope the code below helps.
somespider.py
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.spider import BaseSpider
from demo.items import DemoItem
from selenium import webdriver

def removeUnicodes(strData):
    if strData:
        strData = strData.encode('utf-8').strip()
        strData = re.sub(r'[\n\r\t]', r' ', strData.strip())
    return strData

class DemoSpider(scrapy.Spider):
    name = "domainurls"
    allowed_domains = ["domain.com"]
    start_urls = ['http://www.domain.com/used/cars-in-trichy/']

    def __init__(self):
        self.driver = webdriver.Remote("http://127.0.0.1:4444/wd/hub",
                                       webdriver.DesiredCapabilities.HTMLUNITWITHJS)

    def parse(self, response):
        self.driver.get(response.url)
        self.driver.implicitly_wait(5)
        hxs = Selector(response)
        item = DemoItem()
        finalurls = []
        while True:
            try:
                # keep clicking "show more" until the link disappears;
                # locating the element inside the try block lets the loop
                # end cleanly instead of raising NoSuchElementException
                next = self.driver.find_element_by_xpath('//div[@class="showmorecars hide"]/a')
                next.click()
                # get the data and write it to scrapy items
                item['pageurl'] = response.url
                item['title'] = removeUnicodes(hxs.xpath('.//h1[@class="page-heading"]/text()').extract()[0])
                urls = self.driver.find_elements_by_xpath('.//a[@id="linktodetails"]')
                for url in urls:
                    url = url.get_attribute("href")
                    finalurls.append(removeUnicodes(url))
                item['urls'] = finalurls
            except Exception:
                break
        self.driver.close()
        return item
items.py
from scrapy.item import Item, Field

class DemoItem(Item):
    page = Field()
    urls = Field()
    pageurl = Field()
    title = Field()
Note: You need to have the Selenium RC server running, because HTMLUNITWITHJS works with Selenium RC when using Python.

Run the Selenium RC server by issuing this command:
java -jar selenium-server-standalone-2.44.0.jar
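Before launching the spider, you can sanity-check that the RC server is reachable with a short standalone snippet. This is just a connectivity test, using the same Remote endpoint and capabilities as the spider above; the URL is the one from start_urls:

from selenium import webdriver

# connect to the locally running Selenium RC server
driver = webdriver.Remote("http://127.0.0.1:4444/wd/hub",
                          webdriver.DesiredCapabilities.HTMLUNITWITHJS)
driver.get('http://www.domain.com/used/cars-in-trichy/')
print driver.title  # a non-empty title means the session works
driver.quit()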
Run the spider using this command:

scrapy crawl domainurls -o someoutput.json
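As an alternative that avoids running a browser at all: since the pagination is driven by an AJAX call, you can often request the XHR endpoint directly. Open the browser's network tab, click the pager, and copy the URL of the request it fires. Below is a minimal sketch of that approach; the /ajax/listing?page=N endpoint and its page parameter are hypothetical placeholders for whatever URL the site actually requests, and SumItem is the item class from the question.

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from sum.items import SumItem

class AjaxPagerSpider(scrapy.Spider):
    name = 'ajaxpager'
    allowed_domains = ['demo.com']
    # hypothetical XHR endpoint -- replace it with the real URL shown
    # in the browser's network tab when the pager link is clicked
    page_url = 'http://www.demo.com/ajax/listing?page=%d'

    def start_requests(self):
        yield Request(self.page_url % 1, meta={'page': 1})

    def parse(self, response):
        urls = response.xpath('.//a[@id="linktodetails"]/@href').extract()
        if urls:
            item = SumItem()
            item['page'] = response.url
            item['urls'] = urls
            yield item
            # keep paging until a response comes back with no detail links
            next_page = response.meta['page'] + 1
            yield Request(self.page_url % next_page, meta={'page': next_page})

If the endpoint returns JSON instead of HTML, parse response.body with the json module instead of using XPath.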