# Forum note: this post was last edited by qq1151985918 on 2021-2-22 07:01.
import docx
import openpyxl
# Convert a lightly pre-processed Weibo dump (.docx) into a two-column
# spreadsheet: the location tag of each post and the post's text.

SOURCE_DOCX = "(版本二)稍微处理过的.docx"
OUTPUT_XLSX = "data.xlsx"

# Boilerplate fragments that are deleted wherever they occur.
BOILERPLATE = (" 微博问答", " 微博头条", "拷贝地址一键复制")

# Any line containing one of these is pagination chrome, not content.
NAV_MARKERS = ("首页", "上一页", "下一页")

# Translation table that deletes the ASCII digits 0-9 in a single pass.
DIGIT_STRIPPER = str.maketrans("", "", "0123456789")


def clean_text(raw):
    """Return *raw* with boilerplate removed and whitespace normalised.

    The steps run in a fixed order: delete the boilerplate fragments, turn
    separators into newlines, trim spaces that touch a newline, and finally
    collapse every run of newlines into a single newline.
    """
    cleaned = raw
    for fragment in BOILERPLATE:
        cleaned = cleaned.replace(fragment, "")

    # NOTE(review): as pasted, the three replacements below are identical and
    # leave no space for the trimming passes (or the later split on " ") to
    # find. The original script probably targeted three different whitespace
    # characters that were flattened by the paste -- confirm against the
    # original before relying on the output.
    cleaned = cleaned.replace(" ", "\n")
    cleaned = cleaned.replace(" ", "\n")
    cleaned = cleaned.replace(" ", "\n")

    # Two passes each, so up to two spaces on either side of a newline go.
    cleaned = cleaned.replace(" \n", "\n")
    cleaned = cleaned.replace(" \n", "\n")
    cleaned = cleaned.replace("\n ", "\n")
    cleaned = cleaned.replace("\n ", "\n")

    # Collapse to a fixed point. A fixed number of passes (4, then 3, then 2
    # newlines) leaves a blank line behind for runs of 11 or more newlines.
    while "\n\n" in cleaned:
        cleaned = cleaned.replace("\n\n", "\n")
    return cleaned


def filter_lines(lines):
    """Return *lines* without pagination chrome and short "a/b" markers.

    A line is dropped when it contains any of NAV_MARKERS, or when it contains
    "/" and is shorter than 10 characters. If the first surviving line is
    shorter than 10 characters it is dropped as well. An empty input yields an
    empty list instead of raising IndexError.
    """
    kept = []
    for line in lines:
        if any(marker in line for marker in NAV_MARKERS):
            continue
        if "/" in line and len(line) < 10:
            continue
        kept.append(line)

    if kept and len(kept[0]) < 10:
        kept = kept[1:]
    return kept


def pair_posts(lines):
    """Return ``(city, content)`` tuples found in consecutive *lines*.

    A pair is produced whenever a line longer than 10 characters is directly
    followed by a line shorter than 10 characters. The long line contributes
    everything after its first space; the short line, with ASCII digits
    removed, is the city.
    """
    rows = []
    for current, following in zip(lines, lines[1:]):
        if len(current) > 10 and len(following) < 10:
            # Same result as " ".join(current.split(" ")[1:]): drop the first
            # space-delimited token ("" when the line contains no space).
            content = current.partition(" ")[2]
            city = following.translate(DIGIT_STRIPPER)
            rows.append((city, content))
    return rows


def main():
    """Read SOURCE_DOCX, extract the posts and write them to OUTPUT_XLSX."""
    document = docx.Document(SOURCE_DOCX)
    raw = "\n".join(para.text for para in document.paragraphs)
    lines = filter_lines(clean_text(raw).split("\n"))

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(["地点", "微博内容"])
    for city, content in pair_posts(lines):
        sheet.append([city, content])
    workbook.save(OUTPUT_XLSX)
    print("OK")


if __name__ == "__main__":
    main()
# (stray "|" left over from the forum paste)