chnSegment.py
# coding:utf-8
from collections import Counter
from os import path

import jieba

# Load the user-defined dictionary so domain-specific words are segmented correctly.
jieba.load_userdict(path.join(path.dirname(__file__), 'userdict/userdict.txt'))

stopwords_path = path.join(path.dirname(__file__), 'userdict/stopword.txt')


def word_segment(text):
    '''
    Segment the text with jieba, drop stopwords, and return the
    remaining words joined by single spaces.
    '''
    mywordlist = []
    jieba.suggest_freq(('绿', '衣裳'), True)  # manually mark this phrase so it is split into two words
    seg_list = jieba.cut(text, cut_all=False)
    liststr = "/".join(seg_list)
    # Load the stopword list ('stopword.txt'), one word per line.
    with open(stopwords_path, encoding='utf-8') as f_stop:
        f_stop_seg_list = f_stop.read().split('\n')
    # Remove stopwords and single-character tokens, building the new document.
    for myword in liststr.split('/'):
        if myword.strip() not in f_stop_seg_list and len(myword.strip()) > 1:
            mywordlist.append(myword)
    text = ' '.join(mywordlist)
    # Count word frequencies and write them to a text file, sorted in descending order.
    dataDict = Counter(mywordlist)
    sorted_data = sorted(dataDict.items(), key=lambda item: item[1], reverse=True)
    with open('doc/词频统计.txt', 'w', encoding='utf-8') as fw:
        for k, v in sorted_data:
            fw.write("%s,%d\n" % (k, v))
    print("Word frequency counts written to doc/词频统计.txt")
    return text
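
For reference, a minimal usage sketch of calling word_segment on a source document. The input path doc/sample.txt and the preview length are assumptions for illustration, not part of this repository:

# coding:utf-8
# Hypothetical usage sketch: the input file path below is an assumption.
from os import path

from chnSegment import word_segment

if __name__ == '__main__':
    # Read a source document (assumed path), segment it, and preview the result.
    with open(path.join('doc', 'sample.txt'), encoding='utf-8') as f:
        raw_text = f.read()
    segmented = word_segment(raw_text)
    print(segmented[:200])  # preview the first 200 characters of the segmented text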