-
Notifications
You must be signed in to change notification settings - Fork 4
/
vocab.py
97 lines (78 loc) · 2.94 KB
/
vocab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""Construct English/Chinese vocab for csv files.
Author: Yixu GAO ([email protected])
Use nltk to cut English sentences.
Use jieba to cut Chinese sentences.
Usage:
from vocab import Vocab
vocab = Vocab('zh')
vocab.load_file_to_dict('SMP-CAIL2020-train.csv', list(range(2, 8)))
vocab.load_file_to_dict('SMP-CAIL2020-test1.csv', list(range(2, 8)))
vocab.load_file_to_dict('SMP-CAIL2020-test2.csv', list(range(2, 8)))
vocab.write2file('vocab.txt', False)
In SMP-CAIL2020-Argmine task, simply run this file
python vocab.py
or
from vocab import build_vocab
build_vocab()
"""
from typing import List, Dict
import jieba
import nltk
import pandas as pd
class Vocab:
"""Build vocab for files and write to a file.
Attributes:
language: 'zh' for Chinese and 'en' for English
word_dict: word frequency dict, {<word>: frequency}
e.g. {'我': 30}
"""
def __init__(self, language='zh', word_dict: Dict[str, int] = None):
"""Initialize with language type and word_dict.
Args:
language: 'zh' or 'en'
"""
self.language = language
self.word_dict = word_dict if word_dict else {}
def load_file_to_dict(
self, filename: str, cols: List[int] = None) -> Dict[str, int]:
"""Load columns of a csv file to word_dict.
Args:
filename: a csv file with ',' as separator
cols: column indexes to be added to vocab
Returns:
word_dict: {<word>: frequency}
"""
data_frame = pd.read_csv(filename)
if not cols:
cols = range(data_frame.shape[1])
for row in data_frame.itertuples(index=False):
for i in cols:
sentence = row[i]
if self.language == 'zh':
words = jieba.lcut(sentence)
else: # 'en'
words = nltk.word_tokenize(sentence)
for word in words:
self.word_dict[word] = self.word_dict.get(word, 0) + 1
return self.word_dict
def write2file(self,
filename: str = 'vocab.txt', fre: bool = False) -> None:
"""Write word_dict to file without file head.
Each row contains one word with/without its frequency.
Args:
filename: usually a txt file
fre: if True, write frequency for each word
"""
with open(filename, 'w', encoding='utf-8') as file_out:
for word in self.word_dict:
file_out.write(word)
if fre:
file_out.write(' ' + str(self.word_dict[word]))
file_out.write('\n')
def build_vocab(file_in, file_out):
"""Build vocab.txt for SMP-CAIL2020-Argmine."""
vocab = Vocab('zh')
vocab.load_file_to_dict(file_in, list(range(2, 8)))
vocab.write2file(file_out, False)
if __name__ == '__main__':
build_vocab('data/SMP-CAIL2020-train.csv', 'vocab.txt')