Python用python-docx抓取公众号文章写入word

el/2024/4/20 15:07:45

一、安装包

pip3 install python-docx

二、了解python-docx

from docx import Document
from docx.shared import Inches

# Demo script: builds demo.docx using the most common python-docx calls.
document = Document()

# Add a heading and set its level (range 0 to 9, default 1).
document.add_heading('Document Title', 0)

# Add a paragraph; the text may contain tabs (\t), newlines (\n) or
# carriage returns (\r).
p = document.add_paragraph('A plain paragraph having some ')
# Append more text to the paragraph; each run can be styled separately.
p.add_run('bold').bold = True
p.add_run(' and some ')
p.add_run('italic.').italic = True

document.add_heading('Heading, level 1', level=1)
document.add_paragraph('Intense quote', style='Intense Quote')

# Add list items (preceded by a bullet).
document.add_paragraph('first item in unordered list', style='List Bullet')
document.add_paragraph('second item in unordered list', style='List Bullet')

# Add list items (preceded by a number).
document.add_paragraph('first item in ordered list', style='List Number')
document.add_paragraph('second item in ordered list', style='List Number')

# Add a picture; the image file must exist next to the script.
document.add_picture('monty-truth.png', width=Inches(1.25))

records = (
    (3, '101', 'Spam'),
    (7, '422', 'Eggs'),
    (4, '631', 'Spam, spam, eggs, and spam'),
)

# Add a table with one row and three columns.
# Possible values for the table style argument:
# Normal Table
# Table Grid
# Light Shading, Light Shading Accent 1 to Light Shading Accent 6
# Light List, Light List Accent 1 to Light List Accent 6
# Light Grid, Light Grid Accent 1 to Light Grid Accent 6
# (there are many more; the rest are omitted)
table = document.add_table(rows=1, cols=3, style='Light Shading Accent 2')
# Get the list of cells in the first row.
hdr_cells = table.rows[0].cells
# Set the text of the three header cells.
hdr_cells[0].text = 'Qty'
hdr_cells[1].text = 'Id'
hdr_cells[2].text = 'Desc'
# `item_id` rather than `id`, so the builtin is not shadowed.
for qty, item_id, desc in records:
    # Append a row to the table and get its list of cells.
    row_cells = table.add_row().cells
    row_cells[0].text = str(qty)
    row_cells[1].text = item_id
    row_cells[2].text = desc

document.add_page_break()

# Save the .docx document.
document.save('demo.docx')

在这里插入图片描述

三、公众号写入word

# -*- coding:utf-8 -*- 
"""
Author:SPIDERMAN
Time: 2021/2/8 
Software: PyCharm
"""
import time
from scrapy import Selector
import re
from docx import Document
from docx.shared import Inches
import requests
from docx.oxml.ns import qn
from docx.shared import Pt,RGBColor
# HTTP headers sent with the article request.
header = {
    "Host": "mp.weixin.qq.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
}


def _format_timestamp(raw):
    """Format an epoch-seconds string as 'YYYY-MM-DD HH:MM:SS' local time.

    Returns an empty string when ``raw`` is empty or not a usable timestamp.
    """
    if not raw:
        return ''
    try:
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(raw)))
    except (ValueError, OverflowError, OSError):
        return ''


def _safe_filename(title):
    """Replace characters that are not allowed in file names with '_'."""
    cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', title).strip()
    return cleaned or 'untitled'


def get_biz_html(url):
    """Fetch an official-account article page and export it to a .docx file.

    :param url: URL of the article page.
    :return: None. Parsing and export errors are printed, not raised.
    """
    res = requests.get(url=url, timeout=8, headers=header).text
    re_title = re.compile('property="og:title".*?content="(.*?)"', re.S)
    re_time = re.compile(',n="(.*?)",', re.S)
    try:
        selector = Selector(text=res)
        # Article title (required: it is also used as the output file name).
        title_match = re_title.search(res)
        if title_match is None:
            print('[Exception]:article title not found')
            return
        title = title_match.group(1).strip()
        # Account name; fall back to an empty string when the node is missing.
        source = selector.css('#js_name::text').extract_first(default='').strip()
        # Publish time is optional; a missing or malformed value becomes ''.
        time_match = re_time.search(res)
        news_time = _format_timestamp(time_match.group(1)) if time_match else ''
        # Article body.
        content = selector.css('#js_content').extract_first()
        if not content:
            print('[Exception]:article body (#js_content) not found')
            return
        get_html_item(title, content, source, news_time)
    except Exception as e:
        # Top-level boundary: report the failure instead of crashing.
        print('[Exception]:' + str(e))


def get_html_item(title, contents, source, news_time):
    """Split the article body into an ordered list of text and image items.

    :param title: article title.
    :param contents: HTML of the ``#js_content`` element.
    :param source: account name.
    :param news_time: formatted publish time.
    :return: None. The collected items are passed on to ``html_to_word``.
    """
    print('[INFO]:' + title)
    print('[INFO]:' + contents)
    content_list = []
    # Text fragments and image URLs already emitted, so that nested
    # elements do not repeat the same content.
    content_set = set()
    for node in Selector(text=contents).css('#js_content *'):
        node_html = node.extract()
        # Strip before the emptiness check so whitespace-only fragments
        # do not become empty paragraphs.
        text = sub_html(node_html).strip()
        if text and text not in content_set:
            content_set.add(text)
            content_list.append({'text': text})
        if 'img' in node_html:
            img = node.css('img::attr(data-src)').extract_first()
            if img and img not in content_set:
                content_list.append({'img': img.strip()})
                content_set.add(img)
    print('[INFO]:html_to_word')
    html_to_word(title, content_list, source, news_time)


def sub_html(text):
    """Remove HTML tags from ``text`` and decode HTML entities.

    :param text: HTML fragment.
    :return: plain text of the fragment.
    """
    import html

    # ``[^>]*`` also matches newlines, so tags spanning several lines go too.
    stripped = re.sub(r'<[^>]*>', '', text)
    # Decode after stripping so an escaped '&lt;b&gt;' stays literal text.
    return html.unescape(stripped)


def html_to_word(title, contents, source, news_time):
    """Write the article to ``<title>.docx``.

    :param title: article title; also used (sanitised) as the file name.
    :param contents: list of ``{'text': ...}`` / ``{'img': ...}`` items.
    :param source: account name.
    :param news_time: formatted publish time.
    :return: None.
    """
    document = Document()
    normal = document.styles['Normal']
    normal.font.name = u'宋体'
    # The East Asian font has to be set on the XML element as well.
    normal._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    normal.font.size = Pt(10.5)
    normal.font.color.rgb = RGBColor(0, 0, 0)
    # Add the title as a level-0 heading.
    document.add_heading(title, 0)
    # Italic is a run property, so set it on a run rather than the paragraph.
    byline = '{source}                     {news_time} '.format(source=source, news_time=news_time)
    document.add_paragraph().add_run(byline).italic = True
    for content in contents:
        img = content.get('img')
        if img:
            try:
                requset_img(img)
                document.add_picture('1.jpg', width=Inches(5))
            except Exception as e:
                # One bad image must not abort the whole export.
                print('[Exception]:skip image {}: {}'.format(img, e))
            continue
        text = content.get('text')
        if text:
            document.add_paragraph(text)
    document.add_page_break()
    # Save the .docx document.
    filename = '{}.docx'.format(_safe_filename(title))
    document.save(filename)
    print('[INFO]:{} is ok'.format(filename))


def img_parse(response):
    """Write raw image bytes to the temporary file ``1.jpg``."""
    with open('1.jpg', 'wb') as img:
        img.write(response)


def requset_img(url):
    """Download the image at ``url`` and store it as ``1.jpg``.

    Raises a ``requests`` exception on a network error or HTTP error status.
    """
    response = requests.get(url=url, timeout=8)
    # Fail on error statuses so an error page is not saved as an image.
    response.raise_for_status()
    img_parse(response.content)


if __name__ == '__main__':
    get_biz_html('https://mp.weixin.qq.com/s/SR7-VuRNH4h8W2MrluSjiA')

同学们可以自己运行一下

console.log("公众号:虫术")
console.log("wx:spiderskill")
欢迎大家前来交流

http://www.ngui.cc/el/3376923.html

相关文章

flask给使用模板给js传参

用echarts时候发现eval处理传过来的值js里会报错，采用以下方式传值。 data: {{ dt_list | safe }}完美解决

python2忽略ssl证书验证问题

问题： requests.exceptions.SSLError: HTTPSConnectionPool(host=*****, port=****) 解决办法： import urllib3 urllib3.disable_warnings() #verify=False requests.post(url=url, params=params, headers=headers, data=post_params, verify=False)

MagiskHide android ro.debuggable属性调试修改

如果想要调试android 的程序,以下两个条件满足一个就行。 1、是apk的配置文件内的AndroidManifest.xml的 android:debuggable=”true”。 2、是修改/default.prop中ro.debuggable=1。 第一种通常是解包添加属性再打包,随着加壳软件以及apk校验等,容易出现安装包异常。 第…

“WCHAR *“ 类型的实参与 “LPSTR“ 类型的形参不兼容

在VS2013编译器中直接输入的字符串常量（如“abc”）默认是以const char *的格式（即ANSI编码）储存的，因此会导致类型不匹配的编译错误。 方法是右击“解决方案资源管理器”中的项目，“属性→配置属性→常规→…

将抓包证书推至安卓系统目录

1、找到证书文件 windows目录: C:\Users\用户名\.mitmproxy，并将mitmproxy-ca-cert.pem 复制一份重命名为：c8750f0d.0 。 2、传入手机系统目录并给予权限(root权限下 #1 推至sdcard目录 adb push c8750f0d.0 /sdcard#2 切换至证书目录并挂载系统目录为…

C++ HttpWebRequest发送POST请求,参数以json格式传递

string PostUrl = "URL"; JObject patientinfo = new JObject(); patientinfo["name"] = "TOM"; patientinfo["age"] = 12; string sendData = JsonConvert.SerializeObject(patientinfo); //eg: 发送Url需要的格式：sendData={"…

Python根据关键词在360、百度、bing搜索下载图片

DownloaderImage 项目地址:https://github.com/404SpiderMan/DownloadImage（求star） 1. 简介 输入一组关键词，指定所需图片个数，在常见搜索引擎中检索，记录图片url地址，并将图片保存在指定目录下。 百度…

PC微信获取朋友圈数据api接口

采用http通信方便调用 获取首页数据: http://10.222.6.70:6688/get?page=-1 翻页： http://10.222.6.70:6688/get?page=13666459078131650946 数据包括发布时间、username、wxid、点赞等数据 交流体验获取请联系 console.log("wx:spiderskill") 欢迎关注…

Frida与Frida-tools对应关系

查询网址：https://github.com/frida/frida/releases 在对应版本Frida中查找Frida-tools版本即可

Kali安装IDA Pro

┌──(root)-[~/Desktop/IDA_Pro_v6.4] └─# ldd idaq64 linux-gate.so.1 (0xf7f2b000) libida64.so => /root/Desktop/IDA_Pro_v6.4/./libida64.so (0xf7c68000) libQtXml.so.4 => /root/Desktop/IDA_Pro_v6.4/./libQtXml.so.4 (0xf7c11000) libQtGui.so.4 => /root/Desk…