|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
# -*- coding: utf-8 -*-
import re

# Regex fragments consumed by split_into_sentences(). They are kept as plain
# pattern strings (not compiled) because the function concatenates them into
# larger patterns. Raw strings are used so that escapes such as ``\s`` reach
# the regex engine verbatim instead of being treated as (invalid) string
# escapes by the Python parser.

# A single ASCII letter, captured.
alphabets = r"([A-Za-z])"
# Personal titles / "St" immediately followed by a period.
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
# Company / name suffixes that are normally written with a trailing period.
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
# Tokens treated as the start of a new sentence when they follow an acronym
# or a suffix.
starters = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
# Two- or three-letter upper-case acronyms written with periods, e.g. "U.S.".
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
# A period followed by one of the listed top-level domains.
websites = r"[.](com|net|org|io|gov|edu|me)"
# A single decimal digit, captured.
digits = r"([0-9])"
# A run of two or more consecutive periods (e.g. an ellipsis "...").
multiple_dots = r"\.{2,}"
def split_into_sentences(text: str) -> list[str]:
    """
    Split the text into sentences using rule-based heuristics.

    Periods that do not end a sentence (titles, web domains, decimals,
    initials, acronyms, company suffixes, ellipses) are temporarily replaced
    by the marker "<prd>". Every remaining ".", "?" and "!" is then tagged
    with "<stop>", the "<prd>" markers are turned back into periods, and the
    text is split on "<stop>".

    If the text contains substrings "<prd>" or "<stop>", they would lead
    to incorrect splitting because they are used as markers for splitting.

    :param text: text to be split into sentences
    :type text: str
    :return: list of sentences, each stripped of surrounding whitespace
    :rtype: list[str]
    """
    # Pad with spaces so the patterns that require a neighbouring space also
    # match at the very beginning / end of the input.
    text = " " + text + " "
    text = text.replace("\n", " ")

    # --- Protect periods that are not sentence boundaries -----------------
    text = re.sub(prefixes, r"\1<prd>", text)                      # "Dr." -> "Dr<prd>"
    text = re.sub(websites, r"<prd>\1", text)                      # ".com" -> "<prd>com"
    text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)     # "2.5" -> "2<prd>5"
    # An ellipsis keeps all of its dots but still terminates the sentence.
    text = re.sub(
        multiple_dots,
        lambda match: "<prd>" * len(match.group(0)) + "<stop>",
        text,
    )
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    # Single-letter initial followed by a space, e.g. " J. ".
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym directly followed by a sentence starter does end a sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    # Dotted letter sequences: try three letters first, then two.
    text = re.sub(
        alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
        r"\1<prd>\2<prd>\3<prd>",
        text,
    )
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    # A suffix followed by a sentence starter ends a sentence; otherwise its
    # period is protected.
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # --- Move terminal punctuation outside a closing quote ----------------
    # so the quote stays attached to the sentence it closes. str.replace is
    # a no-op when the substring is absent, so no membership test is needed.
    text = text.replace(".”", "”.")
    text = text.replace('."', '".')
    text = text.replace('!"', '"!')
    text = text.replace('?"', '"?')

    # --- Tag the real sentence boundaries, then restore protected periods -
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    # --- Whitespace normalisation -----------------------------------------
    # Insert a space after a comma (plus any closing quotes) that is glued to
    # the next word character.
    # NOTE: \w also matches digits, so "1,000" becomes "1, 000".
    text = re.sub(r'([,][\'"»’]*)(\w)', r"\1 \2", text)
    # Remove extra whitespace before punctuation (.?!,;:),
    # e.g. "Hello ." -> "Hello."
    text = re.sub(r"\s+([.?!,;:])", r"\1", text)
    # Collapse runs of spaces into a single space.
    text = re.sub(r" +", " ", text)

    sentences = [s.strip() for s in text.split("<stop>")]
    # The text after the final "<stop>" is usually empty padding; drop it.
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]
    return sentences
# Sample input for a manual smoke test: mixed English / Chinese text with
# abbreviations, decimals, quotes, ellipses and missing or misplaced spaces.
text = """
I love you.I am Tom. Hello,world!How are you ?
Dr. Smith and Mr. Jones met at 3:30 p.m. The meeting was held at example.com.
"I love this project!" exclaimed Prof. Brown. "It's amazing."
The U.S. Department of Energy (DOE) reported a 2.5% increase in 3.15 at Beijing.
I love you. Tom said.
I love you.Tom said.
“苏超”联赛火爆出圈,场均观众8798人远超中甲,门票一票难求!全民参与+城市荣誉模式激活经济内循环,地域热梗成看点,文旅消费激增,政府主导打造“移动的城市广告”。
网上热度也是相当狂飙。 虎扑App紧急新增“江苏联”频道,上线首日访问量破百万;抖音话题#江苏城市联赛#播放量破亿,素人拍摄的赛事短视频占比超70%……第三轮的门票已经是花钱都抢不到了,甚至有球迷在二手物品交易平台表示,愿意花100元求购一张门票。
But then Wei poured a shower of gold coins into her lap."Never mind where I got them", he whispered. "Let's just say... I made a brilliant business deal!"Mei said nothing — she was too busy polishing the gold.Now news travels fast.Their neighbor,Jin ,soon, heard, that Wei had returned from a big business trip and was now rich. His wife heard too?"Brilliant deal, eh?" she said to him. "If that fool Wei can make all that money,why can't you?"


"""

# Only run the demo when executed as a script, so importing this module for
# split_into_sentences() does not print anything.
if __name__ == "__main__":
    print(split_into_sentences(text))
复制代码 |
|