根据您提供的代码,我发现在 parse_detail 函数中,您使用了 response.xpath 获取数据,但没有指定命名空间。该页面是带有 XML 命名空间声明的 XHTML 文档,如果不在 XPath 查询中指定该命名空间,就无法正确匹配页面元素。
可以在函数开头添加以下代码指定默认命名空间:
def parse_detail(self, response):
    """Parse a journal detail page and yield one JournalsDetailItem.

    The page is served as XHTML with a default XML namespace, so every
    element in the XPath queries must be qualified with a namespace
    prefix bound to that URI.
    """
    # Bind the 'xmlns' prefix to the XHTML default namespace declared
    # on the page so the xpath queries below can match elements.
    namespaces = {'xmlns': 'http://www.w3.org/1999/xhtml'}

    def _first_text(query):
        # extract_first() returns None when the node is absent; default
        # to '' so .strip() cannot raise AttributeError on a partial page.
        return response.xpath(query, namespaces=namespaces) \
            .extract_first(default='').strip()

    jd_item = JournalsDetailItem()
    jd_item['name'] = _first_text('//xmlns:h3[@class="titbox titbox1"]/text()')
    jd_item['institute'] = _first_text('//xmlns:ul[@id="JournalBaseInfo"]/li/span/text()')
    jd_item['total_publications'] = _first_text('//xmlns:ul[@id="publishInfo"]/li/span/text()')
    yield jd_item
另外,在 SeleniumRequest 中,可以去掉 wait_sleep 参数,改用 wait_time=10 和 wait_until=ec.presence_of_element_located((By.XPATH, '//h3[@class="titbox titbox1"]')),表示最多等待 10 秒、待目标元素加载完成后再执行回调函数。
修改后的完整代码如下:
import scrapy
import time
from ..items import JournalsItem, JournalsDetailItem, JournalCoverItem
from scrapy.linkextractors import LinkExtractor
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.wait import WebDriverWait
class CnkiSpider(scrapy.Spider):
    """Crawl CNKI journal listings and per-journal detail pages.

    The listing page is rendered client-side, so a local Selenium Chrome
    driver loads it and clicks the impact-factor tab; detail pages are
    fetched through scrapy-selenium's SeleniumRequest, which waits for
    the title element before invoking the parse callback.
    """

    name = 'cnki'
    # allowed_domains = ['navi.cnki.net']
    start_urls = ['https://navi.cnki.net/knavi/journals/index']
    PAGE_MAX = 290  # only 290 pages of journals have an impact factor

    def __init__(self, *args, **kwargs):
        # Forward scrapy's spider arguments so crawler-supplied kwargs
        # (e.g. from the command line) keep working.
        super().__init__(*args, **kwargs)
        self.driver = webdriver.Chrome()

    def parse(self, response):
        """Render the journal index with Selenium; yield a JournalsItem per
        listed journal and a SeleniumRequest per journal detail link."""
        self.driver.get(response.url)
        # Fixed sleeps give the client-side rendering time to finish.
        # NOTE(review): an explicit WebDriverWait on the target elements
        # would be more reliable than fixed delays.
        time.sleep(2)
        # Click the second right-nav tab to switch to the impact-factor view.
        self.driver.find_element('xpath', '//*[@id="rightnavi"]/ul/li[2]/a').click()
        time.sleep(5)
        # Wrap the rendered page in an HtmlResponse so scrapy selectors work.
        html = self.driver.page_source.encode('utf-8')
        response_obj = scrapy.http.HtmlResponse(url=self.driver.current_url, body=html)
        for journal in response_obj.xpath('//div[@class="detials"]'):
            ji = JournalsItem()
            # extract_first(default='') guards against missing nodes so
            # .strip() never runs on None.
            ji['name'] = journal.xpath('.//h1/text()').extract_first(default='').strip()
            ji['composite_if'] = journal.xpath('.//p/text()').extract_first(default='').strip()
            yield ji
        for journal_link in response_obj.xpath(
                '//ul[@class="list_tup"]/li/a[@target="_blank"]/@href').extract():
            full_url = f'https://navi.cnki.net{journal_link}'
            yield SeleniumRequest(
                url=full_url,
                callback=self.parse_detail,
                wait_time=10,
                wait_until=ec.presence_of_element_located(
                    (By.XPATH, '//h3[@class="titbox titbox1"]')))

    def parse_detail(self, response):
        """Parse a journal detail page and yield one JournalsDetailItem.

        The page is XHTML with a default XML namespace, so every element
        in the XPath queries must be namespace-qualified.
        """
        namespaces = {'xmlns': 'http://www.w3.org/1999/xhtml'}

        def _first_text(query):
            # Default to '' so .strip() cannot raise AttributeError when
            # a node is missing from the page.
            return response.xpath(query, namespaces=namespaces) \
                .extract_first(default='').strip()

        jd_item = JournalsDetailItem()
        jd_item['name'] = _first_text('//xmlns:h3[@class="titbox titbox1"]/text()')
        jd_item['institute'] = _first_text('//xmlns:ul[@id="JournalBaseInfo"]/li/span/text()')
        jd_item['total_publications'] = _first_text('//xmlns:ul[@id="publishInfo"]/li/span/text()')
        yield jd_item

    def close(self, reason):
        """Quit the Selenium driver when the spider closes."""
        self.driver.quit()
希望可以帮到您!