【2】文件读写-2-8-python读写docx
一、常规用法
安装:
pip install python-docx
1.1 读取内容
读取段落
import docx
# Open the document. Only the `docx` module itself is imported above, so
# Document has to be reached through the module namespace.
docStr = docx.Document(docName)
for paragraph in docStr.paragraphs:
    parStr = paragraph.text  # text content of this paragraph
    # The expressions below are example look-ups: each one evaluates a
    # property of the paragraph, and its result is not stored anywhere.
    paragraph.style.name == 'Heading 1'  # level-1 heading?
    paragraph.paragraph_format.alignment == 1  # centered?
    # Same alignment check on the style that follows this paragraph's style.
    paragraph.style.next_paragraph_style.paragraph_format.alignment == 1
    paragraph.style.font.color  # color of the paragraph style's font
读取表格
numTables = docStr.tables  # all tables found in the document
for table in numTables:
    row_count = len(table.rows)
    col_count = len(table.columns)

    # Option 1: fetch the cells of a row once, then index them by column.
    for i in range(row_count):
        row = table.rows[i].cells  # text at row i, column j: row[j].text

    # Option 2: address every cell directly by its (row, column) position.
    for i in range(row_count):
        for j in range(col_count):
            print(table.cell(i, j).text)
读取图片
import xml.etree.ElementTree as ET
from PIL import Image
from docx import Document
def hasImage(par):
    """Collect the relationship ids of the images embedded in a paragraph.

    :param par: a paragraph object from docx; its ``_p.xml`` string is parsed
    :return: a list of ``r:embed`` ids, one for each ``a:blip`` found under a
        ``wp:inline`` element that carries one (empty when there are none)
    """
    namespace = {
        'a': "http://schemas.openxmlformats.org/drawingml/2006/main",
        'r': "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
        'wp': "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    }
    # ElementTree spells a namespaced attribute as "{namespace-uri}local-name".
    embed_attr = '{{{0}}}embed'.format(namespace['r'])

    root = ET.fromstring(par._p.xml.encode('utf-8'))
    ids = []
    for inline in root.findall('.//wp:inline', namespace):
        for img in inline.findall('.//a:blip', namespace):
            # A blip without r:embed (for example one that only has r:link)
            # has no embedded id to report; skip it rather than raise KeyError.
            embed_id = img.attrib.get(embed_attr)
            if embed_id is not None:
                ids.append(embed_id)
    return ids
document = Document(one_word_fp)
# related_parts is indexed by the r:embed ids that hasImage() returns;
# the document part does not change per paragraph, so look it up once.
document_part = document.part
for one_p in document.paragraphs:
    # An empty id list simply makes the inner loop a no-op.
    for one_id in hasImage(one_p):
        image_part = document_part.related_parts[one_id]
        # NOTE(review): every image is written to the same file name, so a
        # later image overwrites an earlier one - confirm this is intended.
        img_name = 'test.png'
        # The context manager closes the file even if the write fails.
        with open(img_name, "wb") as fr:
            fr.write(image_part._blob)
        # IE only supports RGB-mode images, so convert and save in place.
        image = Image.open(img_name)
        image = image.convert('RGB')
        image.save(img_name)
1.2 写word
#coding=utf-8
from docx import Document
from docx.shared import Pt
from docx.shared import Inches
from docx.oxml.ns import qn
# Create the document object
document = Document()
# Add headings at several levels
document.add_heading(u'MS WORD写入测试', 0)
document.add_heading(u'一级标题', 1)
document.add_heading(u'二级标题', 2)
# Add a paragraph of text; the runs below are appended to it
paragraph = document.add_paragraph(u'我们在做文本测试!')
# Font size
run = paragraph.add_run(u'设置字号、')
run.font.size = Pt(24)
# Font family
run = paragraph.add_run('Set Font,')
run.font.name = 'Consolas'
# Chinese font: besides font.name, the w:eastAsia attribute of the run's
# rFonts element is set to the same font name.
run = paragraph.add_run(u'设置中文字体、')
run.font.name = u'宋体'
r = run._element
r.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
# Italic
run = paragraph.add_run(u'斜体、')
run.italic = True
# Bold. Keep creating the run and setting the flag as two statements:
# chaining them (run = add_run(...).bold = True) binds `run` to True
# instead of to the run object.
run = paragraph.add_run(u'粗体')
run.bold = True
# Add a quotation paragraph
document.add_paragraph('Intense quote', style='Intense Quote')
# Unordered (bulleted) list: one paragraph per item, in order
for bullet_text in (u'无序列表元素1', u'无序列表元素2'):
    document.add_paragraph(bullet_text, style='List Bullet')
# Ordered (numbered) list: one paragraph per item, in order
for numbered_text in (u'有序列表元素1', u'有序列表元素2'):
    document.add_paragraph(numbered_text, style='List Number')
# Insert a picture (this uses image.bmp; place that file in the script's
# directory yourself before running)
document.add_picture('image.bmp', width=Inches(1.25))
# Add a table that starts with a single row of three columns
table = document.add_table(rows=1, cols=3)
hdr_cells = table.rows[0].cells
hdr_cells[0].text = 'Name'
hdr_cells[1].text = 'Id'
hdr_cells[2].text = 'Desc'
# Append three more rows. range() is used because xrange() exists only in
# Python 2 and raises NameError on Python 3.
for i in range(3):
    row_cells = table.add_row().cells
    row_cells[0].text = 'test' + str(i)
    row_cells[1].text = str(i)
    row_cells[2].text = 'desc' + str(i)
# Add a page break
document.add_page_break()
# Save the file
document.save(u'测试.docx')
三、讨论
3.1 写入颜色:
from docx import Document
from docx.shared import RGBColor
document = Document()
# Start an empty paragraph and attach one run that carries the text.
colored_run = document.add_paragraph().add_run('some text')
# The color is set on the run's font as an RGB triple.
colored_run.font.color.rgb = RGBColor(0x42, 0x24, 0xE9)
# Add a second paragraph with plain text.
p = document.add_paragraph('aaa')
document.save('demo1.docx')
四、我的应用:
4.1 获取段落的颜色
from docx import Document
document = Document(one_word_fp)
for one_para in document.paragraphs:  # visit every paragraph
    # One entry per run: the string form of that run's font color.
    # The list is rebuilt for each paragraph; nothing in this snippet
    # consumes it yet.
    para_colors = [str(n.font.color.rgb) for n in one_para.runs]
4.2 doc to docx
yum list 'libreoffice*'
yum install libreoffice.x86_64
import subprocess
# Convert the file at one_file_fp to docx by running soffice without a UI;
# the converted file is written to the --outdir directory.
convert_cmd = [
    "soffice", "--headless", "--invisible",
    "--convert-to", "docx", one_file_fp,
    "--outdir", "/data/user/sam/project/drug_news/1.analysis",
]
# check_output raises CalledProcessError when the command exits non-zero.
output = subprocess.check_output(convert_cmd)
参考资料:
这里是一个广告位,感兴趣的都可以发邮件聊聊:tiehan@sina.cn
个人公众号,比较懒,很少更新,可以在上面提问题,如果回复不及时,可发邮件给我: tiehan@sina.cn
个人公众号,比较懒,很少更新,可以在上面提问题,如果回复不及时,可发邮件给我: tiehan@sina.cn