博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
Python 爬取外文期刊论文信息(机械 仪表工业)
阅读量:6453 次
发布时间:2019-06-23

本文共 6268 字,大约阅读时间需要 20 分钟。

目标:爬取 NSTL(国家科技图书文献中心)中 2017 年"机械、仪表工业"分类下所有期刊的论文信息。

代码比较随意,不要介意

第一步,爬取所有期刊链接

#coding=utf-8
# Step 1: walk the paginated NSTL listing opened below (clc=TH) and store the
# href of every journal link in the MongoDB collection nstl.journal_urls.
import time
from selenium import webdriver
from lxml import etree
from pymongo import MongoClient

# Maximum number of one-second polls to wait for a page turn before giving up.
# The original polled forever, so a click that never changed the page hung the
# whole crawl.
PAGE_TURN_TIMEOUT = 60

client = MongoClient("IP", 27017)
db = client["nstl"]
collection = db["journal_urls"]
db.authenticate("", "")

driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")
try:
    driver.get('https://www.nstl.gov.cn/facade/search/clcSearch.do?&lan=eng&clc=TH')
    html = driver.page_source
    tree = etree.HTML(html)
    # Total number of result pages (47 when the author ran the script).
    count = int(tree.xpath("//span[@id='totalPages1']/text()")[0])
    for i in range(count):
        html = driver.page_source
        tree = etree.HTML(html)
        # Extract and store every journal link on the current page.
        for href in tree.xpath("//div[@class='s2listtd2']/span/a/@href"):
            collection.insert({'url': href})
        # Last page reached: there is no further page to click.
        if i == (count - 1):
            break
        # Click the pager link whose text is the next page number
        # (i is 0-based and page numbers are 1-based, hence i + 2).
        driver.find_element_by_xpath('//div[@id="page"]/div//a[text()="%s"]' % str(i + 2)).click()
        # The page turn is considered done once the page source differs from
        # the snapshot taken at the top of this iteration.
        for _ in range(PAGE_TURN_TIMEOUT):
            time.sleep(1)
            if driver.page_source != html:
                break
        else:
            raise RuntimeError(
                "page %d did not load within %d seconds" % (i + 2, PAGE_TURN_TIMEOUT))
finally:
    # Always shut the browser down, including when the crawl fails midway.
    driver.quit()

第二步,爬取每个期刊中所有2017年论文链接

#coding=utf-8
# Step 2: for every journal link stored in nstl.journal_urls, open the journal
# page, step through its 2017 issues and store each article link in
# nstl.journalArticle2017_urls.
import requests
from pymongo import MongoClient
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
import time

# Maximum number of one-second polls to wait for the next issue to load.
# The original polled forever, so an issue click that never changed the page
# hung the whole crawl.
ISSUE_LOAD_TIMEOUT = 60

client = MongoClient("IP", 27017)
db = client["nstl"]
collection1 = db["journal_urls"]
collection2 = db["journalArticle2017_urls"]
db.authenticate("", "")

driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")
try:
    # Loop over every stored journal link.
    for item in collection1.find({}, {"url": 1, "_id": 0}):
        # NOTE(review): the [29:-4] slice presumably strips a wrapper around
        # the real URL in the stored href -- verify against the stored data.
        driver.get(item['url'][29:-4])
        tree = etree.HTML(driver.page_source)
        # When the journal also has 2018 articles, the 2017 node has to be
        # clicked open first.
        # NOTE(review): the flattened source does not show where this `if`
        # body ends; the two clicks and the sleep are assumed to belong to it.
        if tree.xpath("//div[@id='year_2018']"):
            driver.find_element_by_xpath("//div[@id='year_2017']").click()
            time.sleep(1)
            driver.find_element_by_xpath("//div[@id='volumeUl_2017']/div[@class='ltreebom2']").click()
            time.sleep(1)
            # Re-parse so the issue count below reflects the page after the
            # clicks rather than the snapshot taken before them.
            tree = etree.HTML(driver.page_source)
        # Number of 2017 issues; iterate over them with a 1-based index.
        table = tree.xpath("//div[@id='volumeUl_2017']//div[@class='ltreebom3']/a")
        for i in range(1, len(table) + 1):
            wen_html = driver.page_source
            wen_tree = etree.HTML(wen_html)
            # Collect all article links of the issue currently shown. These
            # must come from the freshly parsed wen_tree: the original queried
            # the stale `tree`, which re-stored the first page's links for
            # every issue.
            for href in wen_tree.xpath("//div[@class='s2listtd2']/a/@href"):
                collection2.insert({'url': href})
            # Last issue handled: nothing further to click.
            if i == len(table):
                break
            # Click the next issue open; if it cannot be located or clicked,
            # give up on this journal and move on to the next one.
            try:
                driver.find_element_by_xpath("//div[@id='volumeUl_2017']//div[@class='ltreebom3'][%s]" % str(i + 1)).click()
            except WebDriverException:
                break
            # The click is considered successful once the page source changes.
            for _ in range(ISSUE_LOAD_TIMEOUT):
                time.sleep(1)
                if driver.page_source != wen_html:
                    break
            else:
                raise RuntimeError(
                    "issue %d of %s did not load within %d seconds"
                    % (i + 1, item['url'], ISSUE_LOAD_TIMEOUT))
finally:
    # Always shut the browser down, including when the crawl fails midway.
    driver.quit()

第三步,爬取论文信息详情页源码

#coding=utf-8
# Step 3: for every article link stored in nstl.journalArticle2017_urls, build
# the detail-page URL, load it in the browser and store the page source in
# nstl.journalArticle2017_codes.
import requests
from pymongo import MongoClient
from lxml import etree
from selenium import webdriver
import time

client = MongoClient("IP", 27017)
db = client["nstl"]
collection = db["journalArticle2017_urls"]
collection1 = db["journalArticle2017_codes"]
db.authenticate("", "")

driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")
try:
    # Loop over every stored article link and build its detail-page URL.
    for item in collection.find({}, {"url": 1, "_id": 0}):
        # NOTE(review): the slices presumably cut the sequence number and the
        # document sub-type out of the stored href -- verify against the data.
        url = "https://www.nstl.gov.cn/facade/search/toFullView.do?checkedSEQNO="+item['url'][23:-11]+"&subDocType="+item['url'][-8:-3]
        # # Fetch the page source with a POST request instead
        # for i in range(100):
        #     try:
        #         result = requests.post(url, verify = False)
        #     except:
        #         time.sleep(1)
        #         continue
        #     html = result.text
        #     if html:
        #         break
        # Load the page in the browser and poll (at most 100 times, one second
        # apart) until the source contains the article data. The original
        # compared against `html`, which is only assigned in the commented-out
        # block above and therefore raised NameError. Readiness is now the
        # presence of the title text that the parsing step reads.
        driver.get(url)
        html1 = driver.page_source
        for i in range(100):
            time.sleep(1)
            html1 = driver.page_source
            parsed = etree.HTML(html1)
            if parsed is not None and parsed.xpath("//span[@name='title']/text()"):
                break
        # Store whatever source we ended up with, even after a timeout
        # (as the original did).
        collection1.insert({'html': html1})
finally:
    # Always shut the browser down, including when the crawl fails midway.
    driver.quit()

第四步,解析源码

#coding=utf-8
# Step 4: parse every stored detail-page source in
# nstl.journalArticle2017_codes and store the extracted bibliographic fields
# in nstl.journalArticle2017_data.
from pymongo import MongoClient
from lxml import etree

client = MongoClient("IP", 27017)
db = client["nstl"]
collection1 = db["journalArticle2017_codes"]
collection2 = db["journalArticle2017_data"]
db.authenticate("", "")

# (output field, label text of the <div> preceding the value, XPath tail
# selecting the value text below the label's following siblings).
# Order matters: it is the order in which the fields are written.
LABELLED_FIELDS = (
    ('organization', u'【作者单位】:', "/text()"),
    ('journal_name', u'【刊名】:', "/text()"),
    ('issn', u'【ISSN】:', "/text()"),
    ('publication_year', u'【出版年】:', "/text()"),
    ('volume', u'【卷】:', "/text()"),
    ('issue', u'【期】:', "/text()"),
    ('page_start', u'【起页】:', "/text()"),
    ('page_end', u'【止页】:', "/text()"),
    ('page_count', u'【总页数】:', "/text()"),
    ('clc', u'【分类号】:', "/text()"),
    ('keywords', u'【关键词】:', "/span/a/text()"),
    ('language', u'【语种】:', "/text()"),
    ('summary', u'【文摘】:', "/text()"),
)

# Fields that can hold several values and are therefore stored as a list.
MULTI_VALUED = ('keywords',)

# Loop over every stored page source.
for item in collection1.find({}, {"html": 1, "_id": 0}):
    tree = etree.HTML(item["html"])
    if tree is None:
        # Nothing parseable was stored for this article.
        continue

    dc = {}
    # A page that never finished loading has no title; the original indexed
    # title[0] unconditionally and crashed on such a page.
    title = tree.xpath("//span[@name='title']/text()")
    if title:
        dc['title'] = title[0]
    author = tree.xpath("//a[starts-with(@href,'javascript:searchByAuthor')]/text()")
    if author:
        dc['author'] = author

    for field, label, tail in LABELLED_FIELDS:
        values = tree.xpath("//div[text()='%s']/following-sibling::*%s" % (label, tail))
        if not values:
            continue
        # Keywords are kept as the full list, like authors; the original
        # stored only the first keyword and dropped the rest.
        dc[field] = values if field in MULTI_VALUED else values[0]

    # Skip pages from which nothing could be extracted.
    if dc:
        collection2.insert(dc)

 

转载于:https://www.cnblogs.com/zhangtianyuan/p/9199324.html

你可能感兴趣的文章
阿里Java完整学习资料
查看>>
建立本地repo 管理仓库
查看>>
.Net转Java自学之路—基础巩固篇十(异常)
查看>>
详解redis服务
查看>>
数据结构之--单链表MyArrayList
查看>>
java基础练习2
查看>>
文件操作总结
查看>>
两队选手每队5人进行一对一的比赛(算法)
查看>>
eclipse : Error while performing database login with the driver null
查看>>
【语法】数组Array
查看>>
WebBrowser一点心得,如果在Javascript和Winform代码之间实现双向通信
查看>>
vue elementUI之Form表单 验证
查看>>
Android程序完全退出的三种方法
查看>>
依赖注入和控制反转
查看>>
权限体系构建 - 平台权限
查看>>
线性表5 - 数据结构和算法10
查看>>
电子脉冲示例图
查看>>
通过cmp 指令执行后,相关标志位的值就可以看出比较的结果
查看>>
变量的原理
查看>>
Poj 1017 / OpenJudge 1017 Packets/装箱问题
查看>>