

import time

import scrapy
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.wait import WebDriverWait

# from scrapy_selenium import SeleniumRequest

from ..items import JournalsItem, JournalsDetailItem, JournalCoverItem
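The commented-out SeleniumRequest import points at scrapy-selenium, which moves the browser handling into a downloader middleware instead of the spider managing its own webdriver. A minimal sketch of that alternative, assuming scrapy-selenium is installed, the import above is uncommented, and settings.py carries the driver configuration; the wait mirrors the XPath used in parse() below:

    # settings.py (assumed scrapy-selenium configuration)
    # SELENIUM_DRIVER_NAME = 'chrome'
    # SELENIUM_DRIVER_EXECUTABLE_PATH = '/path/to/chromedriver'
    # SELENIUM_DRIVER_ARGUMENTS = ['--headless']
    # DOWNLOADER_MIDDLEWARES = {'scrapy_selenium.SeleniumMiddleware': 800}

    # Inside the spider class, replacing the manual webdriver management:
    def start_requests(self):
        # The middleware drives the browser and waits for the tab to render.
        yield SeleniumRequest(
            url='https://navi.cnki.net/knavi/journals/index',
            callback=self.parse,
            wait_time=10,
            wait_until=ec.presence_of_element_located(
                (By.XPATH, '//*[@id="rightnavi"]/ul/li[2]/a')),
        )

With this setup the middleware hands parse() the already-rendered page as the response body, so the manual HtmlResponse wrapping below would no longer be needed.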

class CnkiSpider(scrapy.Spider):
    name = 'cnki'
    allowed_domains = ['navi.cnki.net']
    start_urls = ['https://navi.cnki.net/knavi/journals/index']
    PAGE_MAX = 290  # only 290 pages of journals have an impact factor

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Drive a real browser because the journal list is rendered by JavaScript.
        self.driver = webdriver.Chrome()

    def parse(self, response):
        self.driver.get(response.url)
        time.sleep(5)
        try:
            # Block until the right-hand navigation tab has been rendered.
            element_present = ec.presence_of_element_located(
                (By.XPATH, '//*[@id="rightnavi"]/ul/li[2]/a'))
            WebDriverWait(self.driver, 10).until(element_present)

            # Wrap the rendered HTML in an HtmlResponse so Scrapy selectors work on it.
            html = self.driver.page_source.encode('utf-8')
            response_obj = scrapy.http.HtmlResponse(
                url=self.driver.current_url, body=html, encoding='utf-8')

            # "detials" is the class name exactly as it appears in the page markup.
            for journal in response_obj.xpath('//div[@class="detials"]'):
                ji = JournalsItem()
                ji['name'] = journal.xpath('.//h1/text()').extract_first().strip()
                ji['composite_if'] = journal.xpath('.//p/text()').extract_first().strip()
                yield ji
        finally:
            self.driver.quit()

    # Alternatively, quit the driver when the spider closes rather than in parse():
    # def close(self, reason):
    #     self.driver.quit()
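For reference, the spider only populates two fields on JournalsItem; JournalsDetailItem and JournalCoverItem are imported but never used in this snippet. A minimal sketch of the matching items.py, assuming nothing beyond the fields the spider actually fills in:

    # items.py -- a sketch; only name and composite_if are certain from the spider above.
    import scrapy

    class JournalsItem(scrapy.Item):
        name = scrapy.Field()          # journal title
        composite_if = scrapy.Field()  # composite impact factor

    class JournalsDetailItem(scrapy.Item):
        pass  # stub: imported by the spider but not populated in this snippet

    class JournalCoverItem(scrapy.Item):
        pass  # stub: imported by the spider but not populated in this snippet

With the project in place, the scraped items can be exported from the project root with, for example, scrapy crawl cnki -o journals.json.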
