import time

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.wait import WebDriverWait

from ..items import JournalsItem, JournalsDetailItem, JournalCoverItem
class CnkiSpider(scrapy.Spider):
    """Scrape journal names and composite impact factors from CNKI.

    The navigation pages are rendered client-side, so a Selenium-driven
    Chrome instance loads each URL and the rendered HTML is then wrapped
    in an HtmlResponse for normal Scrapy XPath extraction.
    """

    name = 'cnki'
    allowed_domains = ['navi.cnki.net']
    start_urls = ['https://navi.cnki.net/knavi/journals/index']
    PAGE_MAX = 290  # only 290 pages of journals have an impact factor

    def __init__(self, *args, **kwargs):
        # Accept and forward Scrapy's spider kwargs so crawler wiring and
        # spider arguments keep working (the original swallowed them).
        super().__init__(*args, **kwargs)
        self.driver = webdriver.Chrome()

    def parse(self, response):
        """Render the journal index in Chrome and yield a JournalsItem
        for every journal entry found on the page.

        :param response: the (unrendered) Scrapy response; only its URL
            is used — Selenium re-fetches and renders the page.
        """
        self.driver.get(response.url)
        time.sleep(5)  # let the JS-heavy page settle before polling the DOM
        # Wait up to 10 s for the right-hand navigation link, which appears
        # once the journal list has finished rendering.
        element_present = ec.presence_of_element_located(
            (By.XPATH, '//*[@id="rightnavi"]/ul/li[2]/a'))
        WebDriverWait(self.driver, 10).until(element_present)
        html = self.driver.page_source.encode('utf-8')
        rendered = scrapy.http.HtmlResponse(url=self.driver.current_url, body=html)
        # NOTE(review): 'detials' is the site's own (misspelled) class name —
        # do not "fix" the spelling, it must match the live markup.
        for journal in rendered.xpath('//div[@class="detials"]'):
            ji = JournalsItem()
            # extract_first() returns None when the node is absent; default
            # to '' so .strip() cannot raise AttributeError.
            ji['name'] = journal.xpath('.//h1/text()').extract_first(default='').strip()
            ji['composite_if'] = journal.xpath('.//p/text()').extract_first(default='').strip()
            yield ji

    def closed(self, reason):
        # Quit the browser when the spider shuts down. The original called
        # driver.quit() in a finally inside parse(), which destroyed the
        # driver after the first response and broke any further requests.
        self.driver.quit()