根据您提供的代码,我发现在 parse_detail 函数中,您使用了 response.xpath 获取数据,但没有指定命名空间。该页面是带有 XML 命名空间声明的 XHTML 文档,如果不在 XPath 查询中指定该命名空间,就无法正确匹配页面元素。
可以在函数开头添加以下代码指定默认命名空间:
def parse_detail(self, response):
    """Parse a journal detail page and yield one JournalsDetailItem.

    The page is served as XHTML with a default XML namespace, so every
    element in the XPath queries must be qualified with a namespace
    prefix bound to that URI.
    """
    # Bind the 'xmlns' prefix to the XHTML default namespace declared
    # on the page so the xpath queries below can match elements.
    namespaces = {'xmlns': 'http://www.w3.org/1999/xhtml'}

    def _first_text(query):
        # extract_first() returns None when the node is absent; default
        # to '' so .strip() cannot raise AttributeError on a partial page.
        return response.xpath(query, namespaces=namespaces) \
            .extract_first(default='').strip()

    jd_item = JournalsDetailItem()
    jd_item['name'] = _first_text('//xmlns:h3[@class="titbox titbox1"]/text()')
    jd_item['institute'] = _first_text('//xmlns:ul[@id="JournalBaseInfo"]/li/span/text()')
    jd_item['total_publications'] = _first_text('//xmlns:ul[@id="publishInfo"]/li/span/text()')
    yield jd_item
另外,在 SeleniumRequest 中,可以去掉 wait_sleep 参数,改用 wait_time=10 和 wait_until=ec.presence_of_element_located((By.XPATH, '//h3[@class="titbox titbox1"]')),表示最多等待 10 秒、待目标元素加载完成后再执行回调函数。
修改后的完整代码如下:
import scrapy
import time
from ..items import JournalsItem, JournalsDetailItem, JournalCoverItem
from scrapy.linkextractors import LinkExtractor
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.wait import WebDriverWait
class CnkiSpider(scrapy.Spider):
    """Crawl CNKI journal listings and per-journal detail pages.

    The listing page is rendered client-side, so a local Selenium Chrome
    driver loads it and clicks the impact-factor tab; detail pages are
    fetched through scrapy-selenium's SeleniumRequest, which waits for
    the title element before invoking the parse callback.
    """

    name = 'cnki'
    # allowed_domains = ['navi.cnki.net']
    start_urls = ['https://navi.cnki.net/knavi/journals/index']
    PAGE_MAX = 290  # only 290 pages of journals have an impact factor

    def __init__(self, *args, **kwargs):
        # Forward scrapy's spider arguments so crawler-supplied kwargs
        # (e.g. from the command line) keep working.
        super().__init__(*args, **kwargs)
        self.driver = webdriver.Chrome()

    def parse(self, response):
        """Render the journal index with Selenium; yield a JournalsItem per
        listed journal and a SeleniumRequest per journal detail link."""
        self.driver.get(response.url)
        # Fixed sleeps give the client-side rendering time to finish.
        # NOTE(review): an explicit WebDriverWait on the target elements
        # would be more reliable than fixed delays.
        time.sleep(2)
        # Click the second right-nav tab to switch to the impact-factor view.
        self.driver.find_element('xpath', '//*[@id="rightnavi"]/ul/li[2]/a').click()
        time.sleep(5)
        # Wrap the rendered page in an HtmlResponse so scrapy selectors work.
        html = self.driver.page_source.encode('utf-8')
        response_obj = scrapy.http.HtmlResponse(url=self.driver.current_url, body=html)
        for journal in response_obj.xpath('//div[@class="detials"]'):
            ji = JournalsItem()
            # extract_first(default='') guards against missing nodes so
            # .strip() never runs on None.
            ji['name'] = journal.xpath('.//h1/text()').extract_first(default='').strip()
            ji['composite_if'] = journal.xpath('.//p/text()').extract_first(default='').strip()
            yield ji
        for journal_link in response_obj.xpath(
                '//ul[@class="list_tup"]/li/a[@target="_blank"]/@href').extract():
            full_url = f'https://navi.cnki.net{journal_link}'
            yield SeleniumRequest(
                url=full_url,
                callback=self.parse_detail,
                wait_time=10,
                wait_until=ec.presence_of_element_located(
                    (By.XPATH, '//h3[@class="titbox titbox1"]')))

    def parse_detail(self, response):
        """Parse a journal detail page and yield one JournalsDetailItem.

        The page is XHTML with a default XML namespace, so every element
        in the XPath queries must be namespace-qualified.
        """
        namespaces = {'xmlns': 'http://www.w3.org/1999/xhtml'}

        def _first_text(query):
            # Default to '' so .strip() cannot raise AttributeError when
            # a node is missing from the page.
            return response.xpath(query, namespaces=namespaces) \
                .extract_first(default='').strip()

        jd_item = JournalsDetailItem()
        jd_item['name'] = _first_text('//xmlns:h3[@class="titbox titbox1"]/text()')
        jd_item['institute'] = _first_text('//xmlns:ul[@id="JournalBaseInfo"]/li/span/text()')
        jd_item['total_publications'] = _first_text('//xmlns:ul[@id="publishInfo"]/li/span/text()')
        yield jd_item

    def close(self, reason):
        """Quit the Selenium driver when the spider closes."""
        self.driver.quit()
希望可以帮到您!