自然语言处理当中经常需要字符串的查找操作,比如通过查找返回子串在文本当中的位置,或者通过关键词匹配来实现命名实体识别(NER)。
import pandas as pd
import asyncio
# Alternative input file, kept for reference (currently disabled):
# data = pd.read_csv("guba_fc_result_20230413.csv")
# Load the posts to scan; main() reads the "text" column of this frame and
# writes its results back into a "matches" column.
data = pd.read_csv("guba_all_post_20230413.csv")
# Path of the text file that read_list_from_file() turns into the keyword list.
filename = "cate_group.txt"
def read_list_from_file(filename, encoding="utf-8"):
    """Read a text file and return its non-blank lines, stripped.

    Args:
        filename: Path of the text file to read, one entry per line.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding;
            pass another codec (e.g. "gbk") for files saved differently.

    Returns:
        A list with the file's lines in file order, each with surrounding
        whitespace removed. Lines that are empty after stripping are skipped.
    """
    with open(filename, "r", encoding=encoding) as f:
        stripped = (line.strip() for line in f)
        # Drop blank lines: an empty string is a prefix of every text, so an
        # empty entry in a keyword list used for prefix matching is never a
        # meaningful keyword.
        return [entry for entry in stripped if entry]
# Load the keyword list from the file named by `filename`.
cate_group = read_list_from_file(filename)
import marisa_trie
# Build the trie from the keyword list; match_text() queries it through
# trie.prefixes().
trie = marisa_trie.Trie(cate_group)
# Disabled example: collect the trie keys found in one long string.
# For each start offset, trie.prefixes() is called on the remaining text and
# every key it returns is appended to `results`.
# (The sample sentence reads roughly "CATL makes lithium batteries".)
# long_string = '宁德时代是做锂电池的'
# results = []
# for i in range(len(long_string)):
#     matches = trie.prefixes(long_string[i:])
#     # Collect the match results
#     if matches:
#         for matche in matches:
#             results.append(matche)
# print(results)
async def match_text(long_string):
    """Return the trie keys found at every start offset of a text.

    The argument is converted with ``str()`` first, so non-string values are
    matched against their string form. For each start offset, the module-level
    ``trie`` is asked (via ``trie.prefixes``) for the keys that are prefixes
    of the text from that offset on.

    Note: the function is declared ``async`` but awaits nothing, so the whole
    scan runs synchronously once the coroutine is scheduled.

    Args:
        long_string: The value to scan.

    Returns:
        A list of matched keys, ordered by start offset; a key appears once
        for every position at which it was returned.
    """
    text = str(long_string)
    found = []
    for start in range(len(text)):
        # prefixes() returns the keys that begin the remaining text; extending
        # with an empty result is a no-op, so no emptiness check is needed.
        found.extend(trie.prefixes(text[start:]))
    return found
async def main():
    """Match every entry of ``data["text"]`` and save the results.

    One ``match_text`` task is created per value of the "text" column. The
    gathered results are stored in a new "matches" column of ``data``,
    printed, and the frame is written to ``guba_all_matches_20230413.csv``.
    """
    pending = [
        asyncio.create_task(match_text(text))
        for text in data["text"]
    ]
    # gather() returns the results in the same order as the tasks, i.e. in
    # row order, so they can be assigned straight back as a column.
    matches_list = await asyncio.gather(*pending)
    data["matches"] = matches_list
    print(matches_list)
    data.to_csv("guba_all_matches_20230413.csv")
if __name__ == "__main__":
    # Run the matching pipeline only when executed as a script.
    asyncio.run(main())
多思考也是一种努力:做出正确的分析和选择。因为我们的时间和精力都有限,所以要把时间花在更有价值的地方。