-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
183 lines (161 loc) · 9.06 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import os
import json
import logging
from models import KnowledgeGraphLLM
from argparse import ArgumentParser
from step_01_concept_extraction import step_01_concept_extraction
from step_02_triple_extraction import step_02_triple_extraction
from step_03_fusion import step_03_fusion
if __name__ == "__main__":
argparse = ArgumentParser()
argparse.add_argument("--run_name", type=str, default="test",
help="Assign a name to this run. The name will be used to, e.g., determine "
"the output directory. We recommend to use unique and descriptive names "
"to distinguish the results of different models.")
argparse.add_argument("--dataset", type=str,
required=True,
help="Name of the dataset. Is used to, e.g., determine the input directory.")
argparse.add_argument("--relation_definitions_file", type=str,
required=True,
help="Path to the relation definitions file. The file should be a JSON file, "
"where the keys are the relation types and the values are dictionaries "
"with the following keys: 'label', 'description'.")
# these arguments allow to provide the data from the previous steps directly
# instead of running these steps
argparse.add_argument("--input_json_file", type=str, default="",
help="Path to the input file. Step 1 will be skipped if this argument "
"is provided. The input file should be a JSON file with the "
"following structure: "
"{'concept1': [{'abstract': ['abstract1', ...], 'label: 0},...} "
"E.g. data/test/concept_abstracts.json is the associated file created"
"durin step 1 in the test run.")
argparse.add_argument("--input_triple_file", type=str, default="",
help="Path to the input file storing the triples in the format as outputted "
"by the candidate triple extraction model. Step 1 and step 2 will "
"be skipped if this argument is provided.")
# these arguments allow to configure the LLM model
argparse.add_argument("--model", type=str, default="gpt-3.5-turbo",
help="Name of the LLM that should be used for the KG construction.")
argparse.add_argument("--max_resp_tok", type=int, default=200,
help="Maximum number of tokens in the response of the candidate triple "
"extraction model.")
argparse.add_argument("--max_input_char", type=int, default=10000,
help="Maximum number of characters in the input of the candidate triple "
"extraction model.")
argparse.add_argument("--prompt_tpextraction", type=str,
default="prompts/prompt_tpextraction.txt",
help="Path to the prompt template for step 1.")
argparse.add_argument("--prompt_fusion", type=str, default="prompts/prompt_fusion.txt",
help="Path to the prompt template for fusion.")
# these arguments allow to provide additional data
argparse.add_argument("--gold_concept_file", type=str,
default="",
help="Path to a file with concepts that are provided by experts. "
"The file should be a tsv file, each row should look like: "
"'concept id | concept")
argparse.add_argument('--refined_concepts_file', type=str,
default="True",
help='In step 2 (candidate triple extraction) many new concepts might be '
'added. Instead of using these, concepts can be provided through this '
'parameter. Specify the path to a file with refined concepts '
'of the graph. The file should be a tsv file, each row should look like: '
'"concept id | concept name"')
argparse.add_argument("--annotated_graph_file", type=str,
default="data/prerequisite_of_graph.tsv",
help="Path to the annotated graph.")
# language settings
argparse.add_argument('--language', type=str, default='english',
help='Language of the abstracts.')
# logging
argparse.add_argument('--verbose', action='store_true',
help='Print additional information to the console.')
# Parse the arguments
args = argparse.parse_args()
VERBOSE = args.verbose
RUN_NAME = args.run_name
RELATION_DEFINITIONS_FILE = args.relation_definitions_file
MODEL_NAME = args.model
MAX_RESPONSE_TOKEN_LENGTH_CANDIDATE_TRIPLE_EXTRACTION = args.max_resp_tok
PROMPT_TPEXTRACTION_FILE = args.prompt_tpextraction
PROMPT_FUSION_FILE = args.prompt_fusion
# --- Setup ---
# initialize logger
if VERBOSE:
logging_level = logging.DEBUG
else:
logging_level = logging.INFO
logging.basicConfig(level=logging_level, format='%(asctime)s - %(levelname)s - %(message)s',
datefmt='%m/%d/%Y %I:%M:%S %p')
logging.info(f"RUN_NAME: {RUN_NAME}")
# Prepare the output directory
if not os.path.exists('output'):
os.makedirs('output')
if not os.path.exists(f'output/{RUN_NAME}'):
os.makedirs(f'output/{RUN_NAME}')
# write config to output directory
config = args.__dict__
json.dump(config, open(f'output/{RUN_NAME}/config.json', 'w'), indent=4)
# assign output file names if not provided
if args.input_json_file == "":
CONCEPT_EXTRACTION_OUTPUT_FILE = f'output/{RUN_NAME}/concepts.tsv'
CONCEPT_ABSTRACTS_OUTPUT_FILE = f'output/{RUN_NAME}/concept_abstracts.json'
else:
CONCEPT_ABSTRACTS_OUTPUT_FILE = args.input_json_file
logging.info(
f"Using provided input file: {CONCEPT_ABSTRACTS_OUTPUT_FILE} "
f"(skipping step 1 - concept extraction).")
if args.input_triple_file == "":
TRIPLE_EXTRACTION_OUTPUT_FILE = f'output/{RUN_NAME}/step-02.jsonl'
else:
TRIPLE_EXTRACTION_OUTPUT_FILE = args.input_triple_file
logging.info(
f"Using provided input file: {TRIPLE_EXTRACTION_OUTPUT_FILE} "
f"(skipping step 2 - triple extraction).")
# output file of the pipeline
FUSION_OUTPUT_FILE = f'output/{RUN_NAME}/step-03.jsonl'
# Load the relation definitions
relation_def = json.load(open(RELATION_DEFINITIONS_FILE, 'r'))
relation_types = list(relation_def.keys())
relation_2_id = {v: k for k, v in enumerate(relation_types)}
id_2_relation = {k: v for k, v in enumerate(relation_types)}
# Configure API keys
# os.environ["OPENAI_API_KEY"] = json.load(open('private_config.json'))['OPENAI_API_KEY']
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
# init the LLM
model = KnowledgeGraphLLM(model_name=MODEL_NAME,
max_tokens=MAX_RESPONSE_TOKEN_LENGTH_CANDIDATE_TRIPLE_EXTRACTION)
# --- Pipeline ---
if args.input_json_file == "" and args.input_triple_file == "":
# load raw text data
if len(os.listdir(f'data/{args.dataset}/raw/')) == 0:
logging.error(f"No input text files found in data/{args.dataset}/raw/.")
texts = []
for file in os.listdir(f'data/{args.dataset}/raw/'):
if file.endswith('.txt'):
logging.info(f"Loading file: {file}")
with open(f'data/{args.dataset}/raw/{file}', 'r', encoding="utf-8") as f:
for line in f:
texts.append(line)
# extract concepts
step_01_concept_extraction(texts=texts,
concept_extraction_output_file=CONCEPT_EXTRACTION_OUTPUT_FILE,
concept_abstracts_output_file=CONCEPT_ABSTRACTS_OUTPUT_FILE,
logging=logging,
config=config)
# Load the abstract data (either created in step 1 or provided as input)
data = json.load(open(CONCEPT_ABSTRACTS_OUTPUT_FILE, 'r', encoding="utf-8"))
if args.input_triple_file == "":
step_02_triple_extraction(model=model,
output_file=TRIPLE_EXTRACTION_OUTPUT_FILE,
relation_def=relation_def,
data=data,
logging=logging,
config=config)
step_03_fusion(model=model,
input_file=TRIPLE_EXTRACTION_OUTPUT_FILE,
output_file=FUSION_OUTPUT_FILE,
relation_def=relation_def,
relation_2_id=relation_2_id,
data=data,
logging=logging,
config=config)