帮朋友制作一个网站,需要一些产品数据信息。因为是代理其他公司的产品,所以直接爬取代理公司的产品数据。
1.设计数据库
from django.db import models
from uuslug import slugify
import uuid
import os


def products_directory_path(instance, filename):
    """Upload path for a Product image: images/products/<product title>/<random>.<ext>."""
    ext = filename.split('.')[-1]
    # A random 8-hex-char basename avoids clashes between uploads that share a filename.
    filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)
    # return the whole path to the file
    return os.path.join('images', "products", instance.title, filename)


def product_relatedimage_directory_path(instance, filename):
    """Upload path for a related image: images/product_relatedimage/<product title>/<random>.<ext>."""
    ext = filename.split('.')[-1]
    filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)
    # return the whole path to the file
    return os.path.join('images', "product_relatedimage", instance.product.title, filename)


class ProductsCategory(models.Model):
    """产品分类 — product category; categories may nest via parent_category."""
    name = models.CharField('产品分类名', max_length=80, unique=True)
    description = models.TextField('产品分类描述', blank=True, null=True)
    slug = models.SlugField('slug', max_length=80, blank=True, null=True)
    parent_category = models.ForeignKey('self', verbose_name="父级分类", blank=True, null=True,
                                        on_delete=models.CASCADE)

    def save(self, *args, **kwargs):
        # Fill the slug from the name on first save, or whenever it is missing.
        if not self.id or not self.slug:
            self.slug = slugify(self.name)
        super().save(*args, **kwargs)

    def __str__(self):
        return self.name

    class Meta:
        ordering = ['name']
        verbose_name = "产品分类"
        verbose_name_plural = verbose_name


class ProductsTag(models.Model):
    """产品标签 — free-form product tag."""
    name = models.CharField('产品标签名', max_length=30, unique=True)
    slug = models.SlugField('slug', max_length=40)

    def __str__(self):
        return self.name

    def save(self, *args, **kwargs):
        # Same slug-from-name rule as ProductsCategory.
        if not self.id or not self.slug:
            self.slug = slugify(self.name)
        super().save(*args, **kwargs)

    class Meta:
        ordering = ['name']
        verbose_name = "产品标签"
        verbose_name_plural = verbose_name


class Product(models.Model):
    """产品 — one scraped product record."""
    title = models.CharField('标题', max_length=255, unique=True)
    slug = models.SlugField('slug', max_length=255, blank=True, null=True)
    # Raw HTML of the spec <table> scraped from the source site (技术参数).
    jscs = models.TextField('技术参数', blank=True, null=True)
    image = models.ImageField(upload_to=products_directory_path, verbose_name="产品图片")
    views = models.PositiveIntegerField('浏览量', default=0)
    category = models.ForeignKey('ProductsCategory', verbose_name='分类',
                                 on_delete=models.CASCADE, blank=True, null=True)
    tags = models.ManyToManyField('ProductsTag', verbose_name='标签集合', blank=True)

    def save(self, *args, **kwargs):
        # Fill the slug from the title on first save, or whenever it is missing.
        if not self.id or not self.slug:
            self.slug = slugify(self.title)
        super().save(*args, **kwargs)

    def update_views(self):
        """Increment the view counter, writing only the 'views' column."""
        self.views += 1
        self.save(update_fields=['views'])

    def get_pre(self):
        """Return the previous product by id, or None at the first one."""
        return Product.objects.filter(id__lt=self.id).order_by('-id').first()

    def get_next(self):
        """Return the next product by id, or None at the last one."""
        return Product.objects.filter(id__gt=self.id).order_by('id').first()

    def __str__(self):
        return self.title

    class Meta:
        verbose_name = "产品"
        verbose_name_plural = verbose_name


class ProductAdvantage(models.Model):
    """产品优势 — one selling-point line attached to a product."""
    content = models.TextField('产品优势', blank=True, null=True)
    product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

    def __str__(self):
        # content is nullable: __str__ must return a str, never None.
        return self.content or ''

    class Meta:
        verbose_name = "产品优势"
        verbose_name_plural = verbose_name


class ProductBody(models.Model):
    """产品内容 — one description paragraph attached to a product."""
    body = models.CharField('产品内容', max_length=256, blank=True, null=True)
    product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

    def __str__(self):
        # product is nullable: guard against AttributeError on a detached row.
        return self.product.title if self.product else ''

    class Meta:
        verbose_name = "产品内容"
        verbose_name_plural = verbose_name
2.脚本编写
2.1编写获取网页源代码函数
def get_one_page(url):
    """Fetch *url* and return its body decoded as UTF-8, or None on failure.

    Returns None both for non-200 responses and for network errors, so
    callers can simply test the result for truthiness.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    try:
        # timeout keeps the crawler from hanging on a dead connection
        # (the final version of this script in section 3 already uses one).
        res = requests.get(url=url, headers=headers, timeout=10)
    except requests.RequestException:
        # Network-level failure: signal "no page" rather than crash the crawl.
        return None
    res.encoding = 'utf-8'
    return res.text if res.status_code == 200 else None
2.2根据base页面获取所有产品分类页面链接
if __name__ == '__main__':
    # Print the absolute URL of every product-category page in the nav bar.
    doc = etree.HTML(get_one_page(url))
    for href in doc.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href'):
        # hrefs in the nav are relative; prefix the site root.
        print('http://www.kexinjianji.com' + href)
2.3根据产品分类页面链接获取对应所有产品链接
if __name__ == '__main__':
    # One category page: print its name, then every product link it lists.
    doc = etree.HTML(get_one_page(url))
    cat_name = doc.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
    print("产品分类:" + cat_name[0])
    for href in doc.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href'):
        # hrefs are relative; prefix the site root.
        print('http://www.kexinjianji.com' + href)
    print("=====================================================")
两者结合起来就可以打印出所有产品链接
if __name__ == '__main__':
    # Walk every category page and print the product links found on each.
    base = 'http://www.kexinjianji.com'
    nav = etree.HTML(get_one_page(url))
    for cat_href in nav.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href'):
        cat_page = etree.HTML(get_one_page(base + cat_href))
        cat_name = cat_page.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
        print("产品分类:" + cat_name[0])
        for prod_href in cat_page.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href'):
            print(base + prod_href)
        print("=====================================================")
2.4使用xpath解析函数返回产品链接的内容
if __name__ == '__main__':
    # Parse one product page and print every field we later store in Django.
    content = get_one_page(url)
    tree = etree.HTML(content)
    # 产品名称 (product name)
    title = tree.xpath('//*[@id="wrap"]//h1/text()')
    images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
    # 产品图片 — join WITHOUT a trailing slash, matching the combined scripts in
    # 2.3/3; the old 'http://www.kexinjianji.com/' + ... presumably produced a
    # double slash since the scraped src already starts with '/' — TODO confirm.
    images_url = 'http://www.kexinjianji.com' + images[0]
    # 性能特点 (feature lines: text directly in the div or inside its spans)
    xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
    # 技术参数 (tech specs) — keep the whole first <table> as an HTML string.
    jscs = tree.xpath('//table')[0]
    jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
    # 产品内容 (description paragraphs)
    cpnr = tree.xpath('//div[@class="describe"]/p')
    print('产品名称:' + title[0])
    print('产品图片:' + images_url)
    for td in xntd:
        print('性能特点:' + td)
    print('技术参数:' + jscs_str)
    for cp in cpnr:
        # string(.) gathers all text inside the tag, nested spans included.
        cp = cp.xpath('string(.)')
        print('产品内容:' + cp)
    print('============================================')
将三者结合在一起就可以获取所有产品信息
if __name__ == '__main__':
    # Nav bar -> category pages -> product pages; print each product's fields.
    base = 'http://www.kexinjianji.com'
    nav = etree.HTML(get_one_page(url))
    for cat_href in nav.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href'):
        cat_page = etree.HTML(get_one_page(base + cat_href))
        cat_name = cat_page.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
        for prod_href in cat_page.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href'):
            prod_url = base + prod_href
            page = get_one_page(prod_url)
            try:
                doc = etree.HTML(page)
                name = doc.xpath('//*[@id="wrap"]//h1/text()')
                images = doc.xpath('//div[@class="sol_tj_left"]/a/img/@src')
                image_url = base + images[0]
                features = doc.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
                # Pages without a spec <table> raise IndexError here and fall
                # into the except below, which logs the failing URL.
                spec_table = doc.xpath('//table')[0]
                spec_html = etree.tostring(spec_table, encoding='utf-8').decode('utf-8')
                paragraphs = doc.xpath('//div[@class="describe"]/p')
                print("产品分类:" + cat_name[0])
                print('产品链接:' + prod_url)
                print('产品名称:' + name[0])
                print('产品图片:' + image_url)
                for feat in features:
                    print('性能特点:' + feat.strip())
                for para in paragraphs:
                    # string(.) gathers all text inside the <p>, spans included.
                    print('产品内容:' + para.xpath('string(.)'))
                print('============================================')
            except Exception as e:
                print(e)
                print('出错url:' + prod_url)
3.存储到django模型
import requests
from lxml.html import etree
import os
import django
import uuid
from django.core.files.base import ContentFile

# Bootstrap Django so the ORM can be used from this standalone script.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jiaobanzhan.settings")
django.setup()

from products.models import ProductBody, ProductsCategory, Product, ProductAdvantage

url = 'http://www.kexinjianji.com/product/hzshntjbz_1/'


def get_one_page(url):
    """Fetch *url* and return its UTF-8 text, or None on error / non-200."""
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
        res = requests.get(url=url, headers=headers, timeout=10)
        res.encoding = 'utf-8'
        if res.status_code == 200:
            return res.text
        return None
    except Exception:
        # Swallow network errors; callers treat None as "skip this page".
        return None


if __name__ == '__main__':
    content = get_one_page(url)
    tree = etree.HTML(content)
    # Relative hrefs of every category page in the site nav (产品分类url).
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
    for url in catgory_urls:
        url = 'http://www.kexinjianji.com' + url
        content = get_one_page(url)
        tree = etree.HTML(content)
        # Category name shown on the listing page.
        p_catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
        # Product links under this category.
        urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        for url in urls:
            url = 'http://www.kexinjianji.com' + url
            content = get_one_page(url)
            try:
                tree = etree.HTML(content)
                title = tree.xpath('//*[@id="wrap"]//h1/text()')  # 产品名称
                images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
                images_url = 'http://www.kexinjianji.com' + images[0]  # 产品图片
                # 性能特点 (feature lines)
                xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
                # 技术参数 stored as the raw <table> HTML. Pages whose specs are
                # an image have no <table>; the IndexError lands in the except
                # below and the URL is logged for manual handling.
                jscs = tree.xpath('//table')[0]
                jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
                # 产品内容 paragraphs
                cpnr = tree.xpath('//div[@class="describe"]/p')
                # get_or_create replaces the old exists()/get()/new+save dance:
                # one call, no race between the existence check and the insert.
                products_catgory, _ = ProductsCategory.objects.get_or_create(name=p_catgory[0])
                print(products_catgory)
                # Download the product image; timeout + raise_for_status so a
                # dead link neither hangs the crawl nor saves an error page.
                image_content = requests.get(url=images_url, timeout=10)
                image_content.raise_for_status()
                ext = images_url.split('.')[-1]  # image extension
                filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)  # random file name
                # Wrap the raw bytes so Django's ImageField can store them.
                upload_image_file = ContentFile(image_content.content, name=filename)
                # NOTE(review): Product.title is unique, so re-running the script
                # raises IntegrityError for already-stored products — caught below.
                product = Product(title=title[0], jscs=jscs_str,
                                  image=upload_image_file, category=products_catgory)
                product.save()
                for td in xntd:
                    ProductAdvantage.objects.create(content=td, product=product)
                for cp in cpnr:
                    # string(.) gathers all text inside the <p>, spans included.
                    ProductBody.objects.create(body=cp.xpath('string(.)'), product=product)
            except Exception as e:
                print(e)
                print('出错url:' + url)
最后自己手动处理出错url(页面没有获取到技术参数,技术参数是一张图片)
4.总结
1.xpath 获取标签内容时,p标签中嵌套span标签,源码如下
<div class="describe" style="position: relative;"> <p><span>板 宽:</span>1500mm</p> <p><span>板 厚:</span>4.5 mm</p> <p><span>出料口:</span>6口</p> <p><span>重 量:</span>6000 kg</p> </div>
使用xpath获取p标签内容
我想得到的效果如下
板 宽:1500mm
板 厚:4.5 mm
出料口:6口
重 量:6000 kg
使用以下xpath 只能分开获取,不是想要的效果
//div[@class="describe"]/p/span/text()|//div[@class="describe"]/p/text()
百度之后找到的解决办法,使用xpath('string(.)')
1.先获取所有p标签
cpnr = tree.xpath('//div[@class="describe"]/p')
2.使用**string(.)**获取所有标签所有文本
cp = cp.xpath('string(.)')
循环遍历所有p标签即可
《魔兽世界》大逃杀!60人新游玩模式《强袭风暴》3月21日上线
暴雪近日发布了《魔兽世界》10.2.6 更新内容,新游玩模式《强袭风暴》即将于3月21 日在亚服上线,届时玩家将前往阿拉希高地展开一场 60 人大逃杀对战。
艾泽拉斯的冒险者已经征服了艾泽拉斯的大地及遥远的彼岸。他们在对抗世界上最致命的敌人时展现出过人的手腕,并且成功阻止终结宇宙等级的威胁。当他们在为即将于《魔兽世界》资料片《地心之战》中来袭的萨拉塔斯势力做战斗准备时,他们还需要在熟悉的阿拉希高地面对一个全新的敌人──那就是彼此。在《巨龙崛起》10.2.6 更新的《强袭风暴》中,玩家将会进入一个全新的海盗主题大逃杀式限时活动,其中包含极高的风险和史诗级的奖励。
《强袭风暴》不是普通的战场,作为一个独立于主游戏之外的活动,玩家可以用大逃杀的风格来体验《魔兽世界》,不分职业、不分装备(除了你在赛局中捡到的),光是技巧和战略的强弱之分就能决定出谁才是能坚持到最后的赢家。本次活动将会开放单人和双人模式,玩家在加入海盗主题的预赛大厅区域前,可以从强袭风暴角色画面新增好友。游玩游戏将可以累计名望轨迹,《巨龙崛起》和《魔兽世界:巫妖王之怒 经典版》的玩家都可以获得奖励。
更新日志
- 中国武警男声合唱团《辉煌之声1天路》[DTS-WAV分轨]
- 紫薇《旧曲新韵》[320K/MP3][175.29MB]
- 紫薇《旧曲新韵》[FLAC/分轨][550.18MB]
- 周深《反深代词》[先听版][320K/MP3][72.71MB]
- 李佳薇.2024-会发光的【黑籁音乐】【FLAC分轨】
- 后弦.2012-很有爱【天浩盛世】【WAV+CUE】
- 林俊吉.2012-将你惜命命【美华】【WAV+CUE】
- 晓雅《分享》DTS-WAV
- 黑鸭子2008-飞歌[首版][WAV+CUE]
- 黄乙玲1989-水泼落地难收回[日本天龙版][WAV+CUE]
- 周深《反深代词》[先听版][FLAC/分轨][310.97MB]
- 姜育恒1984《什么时候·串起又散落》台湾复刻版[WAV+CUE][1G]
- 那英《如今》引进版[WAV+CUE][1G]
- 蔡幸娟.1991-真的让我爱你吗【飞碟】【WAV+CUE】
- 群星.2024-好团圆电视剧原声带【TME】【FLAC分轨】