Python爬虫自动化获取华图和粉笔网站的错题(推荐)

脚本专栏 2024/11/17 佚名

2 0 1

神剑山庄资源网 Design By www.hcban.com

这篇博客对于考公人或者其他用华图或者粉笔做题的人比较友好，通过输入网址可以自动化获取华图以及粉笔练习的错题。

粉笔网站

我们从做过的题目组中获取错题

打开某一次做题组，我们首先进行抓包看看数据在哪里

我们发现现在数据已经被隐藏，事实上数据在这两个包中：
https://tiku.fenbi.com/api/xingce/questions
https://tiku.fenbi.com/api/xingce/solutions
一个为题目的一个为解析的。此url要通过传入一个题目组参数才能获取到当前题目数据，而题目组参数在这个包中

这个题目组参数与练习网址中倒数第二个数字串有关

url的规则为 'https://tiku.fenbi.com/api/xingce/exercises/' + str(id_) + 查询参数（原文此处在转载时被截断）。先请求这个包拿到题目组参数，再带着该参数去请求下面两个包（


https://tiku.fenbi.com/api/xingce/questions
https://tiku.fenbi.com/api/xingce/solutions



）即可获取到题目数据，而且自己的答案也在 'https://tiku.fenbi.com/api/xingce/exercises/' + str(id_) + 查询参数 这个包中（原文此处同样被截断）。

###此函数用于解析题目和每道题的答案
def jiexi(liebiao):
    """Clean escaped markup in question, option or explanation strings.

    For every string in ``liebiao`` the literal attribute ``flag=\\"tex\\" ``
    (including its trailing space) is removed first, and then every
    remaining backslash is dropped.

    Args:
        liebiao: iterable of raw strings.

    Returns:
        A new list with the cleaned strings, in the same order.
    """
    # Stage 1: drop the escaped flag attribute; stage 2: drop all backslashes.
    without_flag = (re.sub(r'flag=\\"tex\\" ', '', text) for text in liebiao)
    return [re.sub(r'\\', '', text) for text in without_flag]
###此函数用于解析选项
def xuanxiang(liebiao):
    """Normalise option strings so each ends with exactly one ``</p>``.

    All ``<p>`` tags are removed first, then all ``</p>`` tags, and finally
    a single ``</p>`` is appended to every string.

    Args:
        liebiao: iterable of raw option strings.

    Returns:
        A new list with the normalised strings, in the same order.
    """
    normalised = []
    for option in liebiao:
        # Order matters: opening tags are stripped before closing tags.
        stripped = option.replace('<p>', '').replace('</p>', '')
        normalised.append(stripped + '</p>')
    return normalised
import requests
import re
import pdfkit
import os
# Ask the user for the URL of the practice session to export.
url = str(input("请输入练习的网址："))
### Get the id of this practice session
# NOTE(review): the pattern below contains two "(" groups that are never
# closed, so re.findall raises re.error, and it reads page_text, which no
# earlier visible line assigns. It looks like several statements (the id
# extraction, an intermediate request, and the regexes that should produce
# id_list and your_answer) were fused into one line when this listing was
# extracted. headers, id_list and your_answer are used further down but are
# not assigned anywhere in the visible code either. Restore the missing
# statements from the original script before running.
id_ = re.findall(r'https://www.fenbi.com/spa/tiku.*"questionIds\"\:\[(.*"answer":{"choice":"(.*",',page_text,re.S)
### Name of this practice session (later used as the output file name)
# NOTE(review): page_text, headers and id_list are expected to come from the
# statements preceding this section; they are not assigned in the visible
# code, so confirm they exist before running.
# Every capture group in this section was an unterminated "(.*" and raised
# re.error; each is restored to the non-greedy "(.*?)".
name = re.findall(r'"name":"(.*?)",', page_text, re.S)[0]

### Endpoint that actually holds the question data
timu_url = 'https://tiku.fenbi.com/api/xingce/questions'
params = {
    'ids': id_list,
}
response = requests.get(url=timu_url, headers=headers, params=params)
response.encoding = 'utf-8'
page_text = response.text

### Correct answers, one entry per question
true_answer = re.findall(r'"correctAnswer":{"choice":"(.*?)"', page_text, re.S)

### Endpoint that actually holds the explanations
solution_url = 'https://tiku.fenbi.com/api/xingce/solutions'
response = requests.get(url=solution_url, headers=headers, params=params)
response.encoding = 'utf-8'
page_text = response.text

### Explanations
solution_list = re.findall(r'"solution":"(.*?)","userAnswer"', page_text, re.S)
solution_last = jiexi(solution_list)

### Question text, plus the shared material of compound questions
cailiao = []
timu = []
for each in response.json():
    timu.append(each['content'])
    try:
        cailiao.append(each['material']['content'])
    except (KeyError, TypeError):
        # No 'material' key, or its value cannot be indexed: record the
        # 'none' placeholder that the HTML builder checks for.
        cailiao.append('none')

### Options: one regex with four groups yields (A, B, C, D) per question
option_rows = re.findall(
    r'"options":\["(.*?)","(.*?)","(.*?)","(.*?)"\]', page_text, re.S
)
# Transpose rows into per-letter columns; fall back to four empty columns
# when nothing matched so the unpacking below still works.
option_columns = list(zip(*option_rows)) or [(), (), (), ()]
A_option, B_option, C_option, D_option = (
    jiexi(xuanxiang(column)) for column in option_columns
)
### Build the HTML for every wrongly answered question
# NOTE(review): true_answer, your_answer, timu, cailiao, the four *_option
# lists, solution_last and name are expected from the fetch/parse section
# above; your_answer is not assigned anywhere in the visible code.
CHOICE_LETTERS = {'0': 'A', '1': 'B', '2': 'C', '3': 'D'}

# Indexes of the questions whose given answer differs from the correct one.
wrong_indexes = [
    index for index, answer in enumerate(true_answer)
    if answer != your_answer[index]
]

html_parts = ["<!DOCTYPE html>\n<meta charset='utf-8'>\n<html>"]
seen_materials = set()
for index in wrong_indexes:
    ### Compound questions: print the shared material only once
    material = cailiao[index]
    if material != 'none' and material not in seen_materials:
        seen_materials.add(material)
        html_parts.append(material)
    html_parts += [
        str(index + 1), '、', timu[index][3:],  # [3:] drops the first 3 characters
        'A、', A_option[index],
        'B、', B_option[index],
        'C、', C_option[index],
        'D、', D_option[index],
        '<br>',
    ]

# Vertical gap between the question section and the answer section.
html_parts.append('<br><br><br><br><br><br><br><br><br>')

for index in wrong_indexes:
    html_parts += [
        '第' + str(index + 1) + '题的正确答案为',
        # Choices other than '0'..'3' get no letter, as before.
        CHOICE_LETTERS.get(true_answer[index], ''),
        solution_last[index],
        '<br>',
    ]
html_parts.append('</html>')
all_content = ''.join(html_parts)

path_name = name + '.html'
### Save as a temporary HTML file
with open(path_name, 'w', encoding='utf-8') as fp:
    fp.write(all_content)
try:
    # The string below is a placeholder: replace it with the real location
    # of wkhtmltopdf.exe on this machine.
    confg = pdfkit.configuration(wkhtmltopdf=r'wkhtmltopdf.exe保存的路径')
    # path_name is a local file, so from_file is the matching pdfkit entry point.
    pdfkit.from_file(path_name, name + '.pdf', configuration=confg)
    print('错题PDF保存成功')
finally:
    ### Delete the temporary HTML file even when the conversion fails
    os.remove(path_name)


华图网站

也是答题记录中自己做过的题目

华图网站稍微不一样，它的数据直接抓包就可以看到

通过请求这个包即可获取到数据，接下来就是解析的事情了，这次我用word文档进行存储，如果觉得不方便也可以像上文一样构造HTML


##导包
import requests
import lxml.etree
import re
import time
import os
from docx import Document
from docx.shared import Inches
from docx.shared import Pt
from docx.shared import Inches
from docx.oxml.ns import qn
from docx.enum.text import WD_ALIGN_PARAGRAPH
# Ask the user for the URL of the answer-record page to export.
url = str(input("请输入练习的网址："))
headers = {
    ### Fill in the complete request headers; otherwise no data is returned
}
response = requests.get(url=url, headers=headers)
response.encoding = 'utf-8'
reptext = response.text
tree = lxml.etree.HTML(reptext)  # parsed page source, queried with XPath below

# Folder for images downloaded from the site. makedirs(exist_ok=True)
# replaces the separate exists()/mkdir() pair with one call.
dirName = "考公图片"
os.makedirs(dirName, exist_ok=True)
 
jiexi = re.findall(r'<div class="jiexi-item-title">解析.*"(.*".*?>', each)) #获取解析里的图片URL
 
# Explanation image URLs: an empty match list becomes the placeholder [1]
# so that indexing [0] later always works.
imgt = [[1] if found == [] else found for found in imgg]

# Explanation text with every HTML tag stripped.
jiexilast = [re.sub(r'<[^>]+>', '', raw_html) for raw_html in jiexi]

# Correct answers: match the whole <span>, then strip the tags.
corrected = re.findall(r'<span class="g-right-answer-color">[a-zA-Z]{1,4}</span>', reptext)
correct = [re.sub(r'<[^>]+>', '', span) for span in corrected]

# The user's own answers, extracted the same way.
yoursed = re.findall(r'<span class="yellowWord">[a-zA-Z]{1,4}</span>', reptext)
yours = [re.sub(r'<[^>]+>', '', span) for span in yoursed]
 
timuleixing = re.findall(r'<span class="greenWord">(.*"greenWord">.*"(.*".*?>', each))
# Final list of question image URLs: an empty match list becomes the
# placeholder [1] so that indexing [0] later always works.
imgx = [[1] if found == [] else found for found in img]

# Text nodes of the page title block (type of this exercise).
v = tree.xpath('//div[@class="exercise-main-title"]//text()')

try:
 ###这是既有复合题也有单选题的
 fuheti = re.findall(r'<!--复合题-->(.*"exercise-main-topics"',reptext,re.S)[0].split('<!--复合题-->')
except:
 try:
  ###这是只有复合题或者复合题在最后几题的
  fuheti = re.findall(r'<!--复合题-->(.*"main-topic-choices">(.*"main-topic-letters clearfix pl14">',reptext,re.S)
for everything in xuanxiang:
 try: ##处理只有两个选项
  axuanxiang.append(re.sub("<.*?>","",re.findall(r'<div.*"<.*?>","",re.findall(r'<div.*"<.*?>","",re.findall(r'<div.*"<.*?>","",re.findall(r'<div.*"(.*".*?>',re.findall(r'.*"保存成功")
       document.add_picture(img_path, width=Inches(5))
      choose.append(fuheti_last)
  except:
   pass
  
  ### Add the single-choice question text
  # NOTE(review): the loop and condition headers enclosing this body are not
  # visible (the preceding lines were garbled during extraction); count,
  # timuleixing, find5, document and the *xuanxiang lists come from there.
  p = document.add_paragraph()
  run = p.add_run(str(count+1)+"、"+timuleixing[count]+find5[count][3:])
  run.font.size = Pt(14)
  run.font.name=u'宋体'
  r = run._element
  r.rPr.rFonts.set(qn('w:eastAsia'),u'宋体')
  # First image URL recorded for this question.
  url = imgx[count][0]
  # NOTE(review): the key below is spelled 'Use-Agent'; the standard header
  # name is 'User-Agent', so this value is probably not sent as the UA.
  headers ={
   'Use-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'
  }
  # Best-effort image download: any failure is swallowed by the bare except
  # and the counter is advanced on both paths.
  try:
   img_data = requests.get(url = url,headers = headers).content
   img_path = dirName+'/'+'tupian'+'.jpg'
   with open(img_path,'wb') as fp:
    fp.write(img_data)
    print("保存成功")
   document.add_picture(img_path, width=Inches(5))
   count+=1
  except:
   count+=1
   
  ### Add the options
  # count has already been incremented above, hence the count-1 index.
  p = document.add_paragraph()
  run = p.add_run(axuanxiang[count-1])
  run.font.size = Pt(14)
  run.font.name=u'宋体'
  r = run._element
  r.rPr.rFonts.set(qn('w:eastAsia'),u'宋体')
  p = document.add_paragraph()
  run = p.add_run(bxuanxiang[count-1])
  run.font.size = Pt(14)
  run.font.name=u'宋体'
  r = run._element
  r.rPr.rFonts.set(qn('w:eastAsia'),u'宋体')
  p = document.add_paragraph()
  run = p.add_run(cxuanxiang[count-1])
  run.font.size = Pt(14)
  run.font.name=u'宋体'
  r = run._element
  r.rPr.rFonts.set(qn('w:eastAsia'),u'宋体')
  p = document.add_paragraph()
  run = p.add_run(dxuanxiang[count-1])
  run.font.size = Pt(14)
  run.font.name=u'宋体'
  r = run._element
  r.rPr.rFonts.set(qn('w:eastAsia'),u'宋体')
  # Blank paragraph separating this question from the next one.
  p = document.add_paragraph()
  run = p.add_run("\n")
  run.font.size = Pt(14)
  run.font.name=u'宋体'
  r = run._element
  r.rPr.rFonts.set(qn('w:eastAsia'),u'宋体')
  
 else:
  count+=1

### Answer section of the Word document, then save it
# NOTE(review): document, jiexilast, correct, yours, imgt, dirName and v are
# expected from the earlier parts of this script.
def _add_songti_run(text):
    """Append a paragraph to ``document`` containing ``text`` in 14pt 宋体.

    Both the run font name and the w:eastAsia font attribute are set, as
    the original inline code did for every paragraph.

    Returns:
        The created run.
    """
    paragraph = document.add_paragraph()
    run = paragraph.add_run(text)
    run.font.size = Pt(14)
    run.font.name = u'宋体'
    run._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    return run


### Spacer between the question section and the answer section
_add_songti_run("\n\n\n\n\n")

### Tidy the explanations: the newline + two tabs separator becomes a colon
jiexilast2 = [re.sub(r'\n\t\t', '：', ok) for ok in jiexilast]

# The header key was misspelled 'Use-Agent', so the browser UA string was
# not sent under the standard 'User-Agent' header name.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'
}
# Every image is written to the same scratch file before being embedded.
img_path = dirName + '/' + 'tupian' + '.jpg'

for counting, every in enumerate(correct):
    if every == yours[counting]:
        continue  # answered correctly: nothing to export
    ### Add the correct answer and its explanation
    _add_songti_run(
        str(counting + 1) + "、" + "正确答案为：" + correct[counting]
        + "\n" + jiexilast2[counting]
    )
    # First image URL of the explanation; imgt holds the placeholder [1]
    # when the explanation has no image.
    url = imgt[counting][0]
    try:
        img_data = requests.get(url=url, headers=headers).content
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print("保存成功")
        document.add_picture(img_path, width=Inches(5))
        print("写入成功")
    except Exception:
        # Deliberately best-effort: a placeholder URL, a failed download or
        # an unreadable image just means this answer gets no picture.
        # Unlike the bare except, Ctrl+C is no longer swallowed.
        pass

### Save the document
doc_title = v[0][5:-3]
document.save(doc_title + '.docx')
print(doc_title + '保存成功！')


总结
粉笔和华图错题爬虫主要区别是华图获取数据简单，解析操作繁琐；粉笔的数据隐秘，解析起来可以用json，比较方便。


                                
                                    Python爬虫获取网站错题,Python爬虫自动化获取 
                                
                                    标签：
                                        Python爬虫获取网站错题,Python爬虫自动化获取
                                     

                                神剑山庄资源网 Design By www.hcban.com


                        
                            
                                神剑山庄资源网
                                免责声明：本站文章均来自网站采集或用户投稿，网站不提供任何软件下载或自行开发的软件！
                                如有用户或公司发现本站内容信息存在侵权行为，请邮件告知！ 858582#qq.com
                            
                        
                        
                            
                                上一篇
                                利用python查看数组中的所有元素是否相同
                            
                            
                                下一篇
                                tensorflow与numpy的版本兼容性问题的解决
                            
                        
                        
                        神剑山庄资源网 Design By www.hcban.com
                        
                            
                                
                                
                                    评论“Python爬虫自动化获取华图和粉笔网站的错题(推荐)”
                                
                            
                            
                                
                                    
                                        
                                            
                                                
                                                    
                                                
                                                
                                                    
                                                
                                                
                                                    
                                                    
                                                    
                                                
                                                
                                                     再想想
                                                    
                                                    
                                                    
                                                    
                                                    
                                                
                                            
                                            
                                        
                                    
                                    
                                    
                                        暂无Python爬虫自动化获取华图和粉笔网站的错题(推荐)的评论...


                    
                        
                            
                                
                                    
                                        
                                    
                                    
                                        
                                            
                                        
                                    
                                
                                
                                    www.hcban.com
                                            
                                                神剑山庄资源网 
                                    
                                    
                                        
                                            
                                        
                                        
                                            
                                        
                                        
                                            
                                        
                                        
                                            
                                        
                                    
                                    
                                    
                                        
                                            139,976影音资源
                                        
                                        
                                            144,792福利资源
                                        
                                        
                                            21,817软件资源
                                        
                                        
                                            631,128技术资源
                                        
                                    
                                
                            
                            
                                最新文章
                                
                                    
                                         
                                       
                                            
                                                
                                            
                                            
                                                
                                                    4complete《丛生》[320K/MP3][85.26MB]
                                                
                                                
                                                    
                                                        2024/11/17
                                                        
                                                         27
                                                    
                                                
                                            
                                        
 
                                       
                                            
                                                
                                            
                                            
                                                
                                                    4complete《丛生》[FLAC/分轨][218.01MB]
                                                
                                                
                                                    
                                                        2024/11/17
                                                        
                                                         3
                                                    
                                                
                                            
                                        
 
                                       
                                            
                                                
                                            
                                            
                                                
                                                    羽泉《给未来的你&天黑天亮》[WAV+CUE]
                                                
                                                
                                                    
                                                        2024/11/17
                                                        
                                                         29
                                                    
                                                
                                            
                                        
 
                                       
                                            
                                                
                                            
                                            
                                                
                                                    庄心妍《我也许在等候》[低速原抓WAV+CUE]
                                                
                                                
                                                    
                                                        2024/11/17
                                                        
                                                         56
                                                    
                                                
                                            
                                        
 
                                       
                                            
                                                
                                            
                                            
                                                
                                                    王雅洁《小调歌后2》[原抓WAV+CUE]
                                                
                                                
                                                    
                                                        2024/11/17
                                                        
                                                         5
                                                    
                                                
                                            
                                        


                                    
                                
                            
                            站点导航
抖音极速版河马剧场京东小红书微信高德地图红果短剧夸克美团剪映拼多多支付宝淘宝快手QQ哔哩哔哩番茄小说得物阿里巴巴王者荣耀和平精英腾讯视频爱奇艺QQ音乐咸鱼之王逆水寒三国志战略版梦幻西游金铲铲之战捕鱼大作战原神英雄联盟手游网易云音乐崩坏星穹铁道优酷视屏酷狗音乐蛋仔派对

Python爬虫自动化获取华图和粉笔网站的错题(推荐)

粉笔网站

华图网站

总结

Python爬虫获取网站错题,Python爬虫自动化获取

利用python查看数组中的所有元素是否相同

tensorflow与numpy的版本兼容性问题的解决

评论“Python爬虫自动化获取华图和粉笔网站的错题(推荐)”

《魔兽世界》大逃杀！60人新游玩模式《强袭风暴》3月21日上线

更新日志

友情链接