step_01_concept_extraction.py
import json
import os.path
from typing import Any

import nltk
import pandas as pd
from tqdm import tqdm
from umap import UMAP
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from nltk.data import find
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from fuzzywuzzy import fuzz
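# The third-party imports above map to these (assumed) PyPI package names:
#   pip install nltk pandas tqdm umap-learn bertopic scikit-learn \
#       sentence-transformers fuzzywuzzy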


def download_if_needed(package):
    """
    Downloads the specified NLTK package if it is not already downloaded.
    """
    try:
        find(f"corpora/{package}")
    except LookupError:
        nltk.download(package)
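
# e.g. download_if_needed("wordnet") looks for corpora/wordnet locally and
# only triggers nltk.download("wordnet") when the corpus is missing.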


def step_01_concept_extraction(texts: list[str],
                               concept_extraction_output_file: str,
                               concept_abstracts_output_file: str,
                               logging: Any,
                               stop_words: list[str] = None,
                               config: dict[str, Any] = None):
    """
    Step 1: Concept Extraction

    Extracts concepts from the texts. The output is a file listing all
    extracted concepts and a JSON file listing the abstracts per concept.

    :param texts: the texts to extract concepts from
    :param concept_extraction_output_file: the file to write the extracted concepts to
    :param concept_abstracts_output_file: the file to write the abstracts per concept to
    :param logging: the logger
    :param stop_words: the stop words to use
    :param config: optional configuration dict supporting the keys
        ``language`` and ``gold_concept_file``
    :return: None
    """
    download_if_needed("wordnet")
    download_if_needed("omw-1.4")
    logging.info("Step 1: Starting concept extraction.")

    # set default config values if not provided
    if config is None:
        config = {}
    if 'language' not in config:
        config['language'] = "english"
    if 'gold_concept_file' not in config:
        config['gold_concept_file'] = ""

    # create the BERTopic extractor
    # language-dependent part: n-gram vectorizer and sentence embedding model
    if config['language'] == "english":
        vectorizer_model = CountVectorizer(ngram_range=(2, 4),
                                           stop_words="english" if stop_words is None else stop_words)
        sentence_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    else:
        logging.info(f"Using language {config['language']}.")
        logging.info("Language not yet supported. Exiting.")
        exit(0)

    # language-independent part
    umap_model = UMAP(n_neighbors=20, n_components=50, metric="cosine", min_dist=0.0, random_state=37)
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=False)
    representation_model = KeyBERTInspired()
    topic_model = BERTopic(verbose=True,
                           umap_model=umap_model,
                           ctfidf_model=ctfidf_model,
                           vectorizer_model=vectorizer_model,
                           embedding_model=sentence_model,
                           representation_model=representation_model,
                           nr_topics=50,
                           low_memory=True,
                           calculate_probabilities=False)
    topics, _ = topic_model.fit_transform(texts)
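    # Note: the per-document topic assignments in `topics` are not used below;
    # only the per-topic keyword lists retrieved via get_topics() are kept.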
    all_topics = topic_model.get_topics()
    extracted_concepts = []
    for topic_num, keywords in all_topics.items():
        # topic -1 collects outlier documents and is skipped
        if topic_num != -1:
            topic_keywords = [word for word, value in keywords]
            extracted_concepts.extend(topic_keywords)

    # remove duplicates
    extracted_concepts = list(set(keyword.lower() for keyword in extracted_concepts))

    # write extracted concepts to file
    with open(concept_extraction_output_file, "w") as f:
        for idx, concept in enumerate(extracted_concepts, 1):
            f.write(f"{idx}|{concept}\n")
    logging.info(f"Concepts written to {concept_extraction_output_file}.")
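    # Each line of the concept file has the form "id|concept",
    # e.g. "1|knowledge graph" (example value illustrative).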

    lemmatizer = WordNetLemmatizer()

    def singularize_concept(concept):
        words = concept.split()
        singular_words = [lemmatizer.lemmatize(word, wordnet.NOUN) for word in words]
        return ' '.join(singular_words)
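    # e.g. singularize_concept("knowledge graphs") -> "knowledge graph":
    # each word is lemmatized as a noun, so plural forms collapse.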

    # singularize concepts
    extracted_concept = [singularize_concept(concept) for concept in extracted_concepts]
    df_concepts = pd.DataFrame(extracted_concept, columns=["concept"])
    df_concepts["label"] = 0

    if config['gold_concept_file'] != "":
        if os.path.exists(config['gold_concept_file']):
            gold_concepts = pd.read_csv(config['gold_concept_file'], delimiter="|", header=None)
            gold_concepts = gold_concepts[1].tolist()
            # singularize concepts
            gold_concept = [singularize_concept(concept) for concept in gold_concepts]
            # convert to lowercase
            gold_concept = [concept.lower() for concept in gold_concept]
            df_gold_concepts = pd.DataFrame(gold_concept, columns=["concept"])
            df_gold_concepts["label"] = 1
            df_concepts = pd.concat([df_concepts, df_gold_concepts])
            # ascending sort by label: on duplicates, the BERTopic entry
            # (label 0) comes first and survives the drop_duplicates below
            df_concepts = df_concepts.sort_values(by="label")
        else:
            logging.info(f"Gold concept file {config['gold_concept_file']} not found. Skipping.")
    df_concepts = df_concepts.drop_duplicates(subset="concept", keep="first")

    # reduce the text dataset to only texts containing the concepts
    def filter_abstracts_by_term(term, abstracts, threshold=70):
        filtered_abstracts = []
        for abstract in abstracts:
            if isinstance(abstract, str):
                if fuzz.partial_ratio(term.lower(), abstract.lower()) >= threshold:
                    filtered_abstracts.append(abstract)
        return filtered_abstracts
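    # fuzz.partial_ratio scores the best-matching substring of the longer
    # text, so an abstract containing "subgraphs" still matches the term
    # "graph" with a score of 100; the threshold of 70 keeps loose matches.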

    concept_abstracts = {}
    for index, row in tqdm(df_concepts.iterrows(), desc="Processing concepts",
                           total=df_concepts.shape[0]):
        concept = row["concept"]
        # cast to a plain int: pandas returns numpy integers, which
        # json.dump cannot serialize
        label = int(row["label"])
        filtered_abstracts = filter_abstracts_by_term(concept, texts)
        concept_abstracts[concept] = {
            "abstracts": filtered_abstracts,
            "label": label
        }

    with open(concept_abstracts_output_file, 'w', encoding='utf-8') as f:
        json.dump(concept_abstracts, f, ensure_ascii=False, indent=4)
    logging.info(f"Abstracts written to {concept_abstracts_output_file}.")
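    # Resulting JSON structure (values illustrative):
    #   {"knowledge graph": {"abstracts": ["..."], "label": 0}, ...}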

    logging.info("Step 1: Concept extraction completed.")
    logging.info(f"Number of concepts extracted through BERTopic: {len(extracted_concept)}")
    if config['gold_concept_file'] != "":
        label_0_count = sum(1 for details in concept_abstracts.values() if details['label'] == 0)
        logging.info(f"Number of concepts added through BERTopic: {label_0_count}")
    empty_abstracts_count = sum(1 for details in concept_abstracts.values() if not details['abstracts'])
    logging.info(f"Number of concepts with empty filtered_abstracts: {empty_abstracts_count}")
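

# Minimal usage sketch (illustrative; the file names are assumptions, and a
# real corpus needs enough abstracts for UMAP/BERTopic to fit):
if __name__ == "__main__":
    import logging as std_logging
    std_logging.basicConfig(level=std_logging.INFO)
    abstracts = ["..."] * 1000  # replace with real abstract texts
    step_01_concept_extraction(
        texts=abstracts,
        concept_extraction_output_file="concepts.txt",           # assumed name
        concept_abstracts_output_file="concept_abstracts.json",  # assumed name
        logging=std_logging,
        config={"language": "english", "gold_concept_file": ""},
    )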