#!/usr/bin/env python
# coding: utf-8
"""
Script to apply a rule-based algorithm to guess whether particular sentences in an
EU legislative document (as found on EURLEX in PDF / HTML format) can be classified as
either regulatory (containing a legal obligation) vs. constitutive (not containing a legal obligation)
Website: http://eur-lex.europa.eu/
"""
import pandas as pd
import spacy
import json
import argparse
# Load the English language model
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('merge_noun_chunks')
argParser = argparse.ArgumentParser(description='Regulatory vs. Non-regulatory sentence classifier for EU legislation based on NLP dependency analysis')
required = argParser.add_argument_group('required arguments')
required.add_argument("-in", "--input", required=True, help="Path to input CSV file. Must have at least one column with header 'sent' containing sentences from EU legislation in English.")
required.add_argument("-out", "--output", required=True, help="Path to output CSV file in which to store the classification results.")
required.add_argument("-agts", "--agents", required=True, help="Path to JSON file which contains data of the form {'agent_nouns' : [...list of lowercase English word strings, each of which represents an entity with agency...]}. Some example words include 'applicant', 'court', 'tenderer' etc.")
args = argParser.parse_args()
IN_FNAME = str(args.input) # Input filename
OUT_FNAME = str(args.output) # Output filename
AGENT_DICT_FILE = str(args.agents) # Agents dictionary JSON file (data inside must be of the form {'agent_nouns' : [...list of English word strings, each of which represents an entity with agency...]})
DEONTICS = ['shall ', 'must ', 'shall not ', 'must not '] # List of relevant deontic phrases
EXCLUDED_PHRASES = ["shall apply", "shall mean", "this regulation shall apply", "shall be binding in its entirety and directly applicable in the member states", "shall be binding in its entirety and directly applicable in all member states", "shall enter into force", "shall be based", "within the meaning", "shall be considered"] # if these phrases occur in a sentence it means it must be constitutive
EXCLUDED_START_PHRASES = ['amendments to decision', 'amendments to implementing decision', 'in this case,', 'in such a case,', 'in such cases,'] # if these phrases occur at the START of a sentence, it must be constitutive
EXCLUDED_ATTR = ["Directive", "Directives", "Protocol", "Protocols", "Decision", "Decisions", "Paragraph", "Paragraphs", "Article", "Articles", "Agreement", "Agreements", "Annex", "Annexes", "ID", "IDs", "Certification", "Certifications", "Fund", "Funds", "PPE", "Regulation", "Regulations", "CONFIDENTIEL UE/EU CONFIDENTIAL", "instrument", "instruments", "signature", "signatures", "safeguard", "discharge", "examination", "authorisation", "relief", "investigations", "july", "june", "february", "january", "march", "april", "may", "august", "september", "october", "november", "december", "emergencies", "capacities", "actions", "resources", "agreements", "chapter", "applications", "(i)", "(ii)", "(iii)", "(iv)", "(v)", "(vii)", "(viii)", "elimination", "material", "banknotes", "guidelines", "vaccination", "official journal", "invalidation", "box", "hereinafter", "coordination", "revenue", "chromatography", "accreditation", "appointments", "termination", "combination", "contracts", "determination", "threshold", "targets", "controls", "designation", "consultation", "factor", "strategy", "import", "fields", "sides", "security", "subparagraph"] # these phrases can never be part of an attribute (agent being regulated) name
START_TOKENS = ['Article', 'Chapter', 'Section', 'ARTICLE', 'CHAPTER', 'SECTION', 'Paragraph', 'PARAGRAPH'] # tokens at the start of a sentence that can be pruned (indicates that the sentenciser did not break up text into clean sentences)
KNOWN_ATTR = {'director' : 'Director', 'directorate-general' : 'Directorate-General', 'director-general' : 'Director-General', 'directorates-general' : 'Directorates-General', 'member states' : 'Member States', 'member state' : 'Member State'} # known attributes and phrasing
# import list of agent nouns (manually curated subset from ConceptNet: https://github.com/commonsense/conceptnet5/wiki/Downloads)
with open(AGENT_DICT_FILE) as json_file:
    AGENT_NOUNS = json.load(json_file)['agent_nouns']
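# Illustrative shape of the agents JSON file (entries here are hypothetical; the real
# file is user-supplied, with example words like 'applicant', 'court', 'tenderer'):
#   {"agent_nouns": ["applicant", "court", "tenderer", "manufacturer"]}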
# BEGIN: function definitions
def extract_verb_with_aux(sent, input_word):
    """Returns the texts of all VERB / AUX ancestors of each occurrence of input_word in sent."""
    global nlp
    doc = nlp(sent)
    # Find all occurrences of the input word in the sentence
    tokens = [t for t in doc if t.text == input_word]
    if len(tokens) == 0:
        return []
    verbs = []
    for token in tokens:
        for parent in token.ancestors:
            if parent.pos_ in ['VERB', 'AUX']:  # AUX verbs are also possible, e.g. "shall be"
                verbs.append(parent.text)
    return verbs
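# Illustrative usage (a sketch: the exact verbs returned depend on the en_core_web_sm parse):
#   extract_verb_with_aux('The applicant shall submit the documents.', 'shall')
#   would be expected to return something like ['submit'], i.e. the verbal head(s) above the deontic.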
def traverse_ancestors_recursive(token, results_s, results_o, dep_seq):
    # Traverse ancestors recursively, recording 'nsubj' subjects and 'pobj' objects as they are
    # found. Recursion bottoms out when a token has no ancestors: token.ancestors is a generator
    # (always truthy, so an explicit emptiness check would never fire), and the loop below then
    # simply does not execute.
    for ancestor in token.ancestors:
        if ancestor.dep_ == 'agent':
            dep_seq.append('agent')
        if ancestor.dep_ == 'nsubj':
            dep_seq.append('nsubj')
            results_s.append({'text': ancestor.text, 'pos': ancestor.pos_})
        elif ancestor.dep_ == 'pobj':
            dep_seq.append('pobj')
            if len(dep_seq) >= 2:
                if (dep_seq[-1] == 'pobj') and (dep_seq[-2] in ['agent', 'prep']):  # consecutive agent->pobj or prep->pobj dependencies
                    results_o.append({'text': ancestor.text, 'pos': ancestor.pos_})
        traverse_ancestors_recursive(ancestor, results_s, results_o, dep_seq)
def traverse_children_recursive(token, results_s, results_o, dep_seq):
    # Traverse children recursively, recording 'nsubj' subjects and 'pobj' objects as they are
    # found. Recursion bottoms out when a token has no children left to traverse.
    for child in token.children:
        if child.dep_ == 'agent':
            dep_seq.append('agent')
        if child.dep_ == 'nsubj':
            dep_seq.append('nsubj')
            results_s.append({'text': child.text, 'pos': child.pos_})
        elif child.dep_ == 'pobj':
            dep_seq.append('pobj')
            if len(dep_seq) >= 2:
                if (dep_seq[-1] == 'pobj') and (dep_seq[-2] in ['agent', 'prep']):  # consecutive agent->pobj or prep->pobj dependencies
                    results_o.append({'text': child.text, 'pos': child.pos_})
        traverse_children_recursive(child, results_s, results_o, dep_seq)
def get_subjects_and_objects(sentence, input_word):
    """Traverses the dependency tree to find subjects / objects associated with the input
    deontic (for the legal obligation).
    Parameters
    ----------
    sentence: str
        Input sentence
    input_word: str
        Deontic phrase
    Returns
    -------
    Lists of subjects and objects associated with the verb(s) governing the input deontic
    phrase, plus the recorded dependency sequence; -1, -1, -1 if no governing verb is found.
    """
    global nlp
    verbs = extract_verb_with_aux(sentence, input_word)
    if len(verbs) == 0:
        return -1, -1, -1
    doc = nlp(sentence)
    subjs = []
    objs = []
    for verb in verbs:
        # Find the verb token in the sentence
        token = None
        for t in doc:
            if t.text == verb:  # fix: compare verbatim (lowercasing the verb broke matches for capitalised verbs)
                token = t
                break
        if token is None:
            return [], [], ''
        dependency_sequence = []
        results_s_a = []
        results_o_a = []
        results_s_c = []
        results_o_c = []
        traverse_ancestors_recursive(token, results_s_a, results_o_a, dependency_sequence)
        traverse_children_recursive(token, results_s_c, results_o_c, dependency_sequence)
        subjs.extend(results_s_a + results_s_c)
        objs.extend(results_o_a + results_o_c)
    return subjs, objs, dependency_sequence
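# Illustrative usage (parse-dependent sketch, not a guaranteed output):
#   subjs, objs, depseq = get_subjects_and_objects('The applicant shall submit the documents.', 'shall')
#   subjs would then typically contain an entry like {'text': 'The applicant', 'pos': ...},
#   because merge_noun_chunks folds noun phrases into single tokens before traversal.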
def contains_sequence(lst, item1, item2):
    """Checks whether two given items appear consecutively in a given list.
    Parameters
    ----------
    lst: list
        The input list
    item1:
        A list item of any type that can appear in a Python list
    item2:
        Another list item of any type that can appear in a Python list
    Returns
    -------
    True if item1 and item2 appear consecutively (in order) in the input list, False otherwise
    """
    for i in range(len(lst) - 1):
        if lst[i] == item1 and lst[i + 1] == item2:
            return True
    return False
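# Example (deterministic, does not depend on spaCy):
#   contains_sequence(['nsubj', 'agent', 'pobj'], 'agent', 'pobj')  -> True
#   contains_sequence(['agent', 'nsubj', 'pobj'], 'agent', 'pobj')  -> False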
def is_valid_sentence(sent):
    """Checks whether a given sentence could possibly be regulatory in nature.
    Certain heuristics can determine that a sentence cannot be regulatory.
    Parameters
    ----------
    sent: str
        The input sentence
    Returns
    -------
    True if sent could possibly be regulatory, False if sent must be constitutive in nature
    """
    global EXCLUDED_PHRASES
    global EXCLUDED_START_PHRASES
    for phrase in EXCLUDED_PHRASES:
        if (phrase in sent.lower()) or (phrase in clean_sentence(sent).lower()):
            return False
    for start_phrase in EXCLUDED_START_PHRASES:
        if sent.lower().startswith(start_phrase):
            return False
    return True
def get_index_of_next_upper_case_token(sent_tokens, start_index=3):
    """Gets the index of the first word (at or after the given start_index) in a list of words
    which starts with an uppercase character.
    Parameters
    ----------
    sent_tokens: list
        List of words.
    start_index: int
        The starting index from which the function starts searching
    Returns
    -------
    i: int
        The first index at or after start_index whose word starts with an uppercase character,
        or -1 if there is none
    """
    for i in range(start_index, len(sent_tokens)):
        if sent_tokens[i][0].isupper():
            return i
    return -1
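# Example (deterministic):
#   get_index_of_next_upper_case_token(['Article', '3', 'Scope', 'This', 'Regulation'], start_index=3)  -> 3
#   ('This' at index 3 is the first token at/after start_index beginning with an uppercase character.)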
def clean_sentence(sent):
    """Formats a sentence so it can be more easily processed downstream when classifying it
    as regulatory or not.
    Parameters
    ----------
    sent: str
        The sentence.
    Returns
    -------
    The processed sentence.
    """
    global START_TOKENS
    sent_tokens = sent.split()
    # guard against headings shorter than three tokens (e.g. "Article 1"), which would
    # otherwise raise an IndexError below
    if len(sent_tokens) > 2 and sent_tokens[0].strip() in START_TOKENS:
        if sent_tokens[1].strip().isnumeric():
            if sent_tokens[2].strip()[0].isupper():
                # find position / index of the next upper case token in sent
                i = get_index_of_next_upper_case_token(sent_tokens)
                if i > 2:
                    return ' '.join(sent_tokens[i:])
                else:
                    return ' '.join(sent_tokens[3:])
            else:
                return ' '.join(sent_tokens[2:])
        else:
            return ' '.join(sent_tokens)
    else:
        return ' '.join(sent_tokens)
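# Example of the heading-pruning behaviour (deterministic for this input):
#   clean_sentence('Article 4 Obligations The importer shall keep a register.')
#   -> 'The importer shall keep a register.'
#   ('Article', '4' and the heading word 'Obligations' are pruned; the sentence resumes
#   at the next capitalised token after index 2.)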
def get_deontic_type(sent, deontics=DEONTICS):
    """Identifies which deontic words appear in a given sentence.
    Parameters
    ----------
    sent: str
        Input sentence
    deontics: list
        List of deontic words or phrases
    Returns
    -------
    Pipe-delimited string of the deontic phrases found in the sentence, or 'None' if there are none
    """
    result = []
    for deontic in deontics:
        if deontic in ' '.join(sent.split()):  # normalise whitespace before matching
            result.append(deontic)
    if len(result) == 0:
        return 'None'
    return ' | '.join(result)
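# Example (deterministic; note the trailing spaces in DEONTICS are preserved):
#   get_deontic_type('The operator shall keep records.')  -> 'shall '
#   get_deontic_type('Records are kept annually.')        -> 'None'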
def contains_kw(stri, listofwords):
    """Checks if any one of a list of phrases appears (as a substring) in a given string.
    Parameters
    ----------
    stri: str
        Input string
    listofwords: list
        List of phrases
    Returns
    -------
    True if at least one of the phrases in the list appears in the input string, False otherwise
    """
    for item in listofwords:
        if item in stri:
            return True
    return False
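# Example (deterministic; this is a substring check, not a whole-word match):
#   contains_kw('shall be binding in its entirety', ['binding', 'hereby'])  -> True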
def contains_kw_token(stri, listofwords):
    """Checks if any one of a list of phrases appears inside any individual token of a given string.
    Similar to contains_kw(), but applies the substring check token by token.
    Parameters
    ----------
    stri: str
        Input string
    listofwords: list
        List of phrases
    Returns
    -------
    True if at least one of the phrases in the list appears in some token of the input string,
    False otherwise
    """
    tokens = stri.split()
    for token in tokens:
        if contains_kw(token, listofwords):
            return True
    return False
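# Example (deterministic): 'Annex' (listed in EXCLUDED_ATTR) is a substring of the token 'Annexes', so:
#   contains_kw_token('the Annexes thereto', EXCLUDED_ATTR)  -> True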
def is_agent_token(token):
    """Checks if a given word / token is an agent noun.
    Words such as 'manufacturer' or 'applicant' indicate agenthood but cannot be recognized by
    traditional NLP NER and POS-tagging algorithms. We use a dictionary approach here with a
    list derived from ConceptNet: https://github.com/commonsense/conceptnet5/wiki/Downloads
    Parameters
    ----------
    token: str
        Input word
    Returns
    -------
    True if the input word contains an entry from the predefined agent noun dictionary as a
    substring, False otherwise.
    """
    global AGENT_NOUNS
    for item in AGENT_NOUNS:
        if item in token.strip():
            return True
    return False
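# Illustrative usage (depends on the user-supplied agent noun list; assuming it contains 'applicant'):
#   is_agent_token('applicant')  -> True
#   Note the substring semantics: a token that merely contains a listed noun also matches.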
def contains_agent_noun(stri):
    """Checks if a given phrase contains an agent noun as either its first or last token.
    If the first or last token is an agent noun, chances are that the full phrase is an agent
    noun phrase.
    E.g. "applicant of the shipment" (first token "applicant" is an agent noun, making the entire phrase an agent noun phrase)
    E.g. "legal advisor" (last token "advisor" is an agent noun, making the entire phrase an agent noun phrase)
    Parameters
    ----------
    stri: str
        Input noun phrase
    Returns
    -------
    True if the input noun phrase is an agent noun phrase, False otherwise.
    """
    tokens = stri.split()
    if len(tokens) > 0:
        first_token = tokens[0]
        if is_agent_token(first_token.lower()):
            return True
        if len(tokens) > 1:  # more than 1 token in noun phrase
            last_token = tokens[-1]
            if is_agent_token(last_token.lower()):
                return True
    return False
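# Illustrative usage (again assuming 'applicant' and 'advisor' appear in the agent noun list):
#   contains_agent_noun('applicant of the shipment')  -> True   (first token is an agent noun)
#   contains_agent_noun('legal advisor')              -> True   (last token is an agent noun)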
def is_regulatory_or_constitutive(sent):
    """A rule-based algorithm based on grammatical dependency parsing to guess whether an input
    sentence contains a regulatory statement or (in the converse case) a constitutive one.
    Parameters
    ----------
    sent: str
        Input string
    Returns
    -------
    dict {'pred' : val, 'attr' : attr} where val is either 0 (constitutive) or 1 (regulatory)
    and attr is the algorithm's guess at the name of the entity being regulated in the input
    sentence (if any)
    """
    global nlp
    global KNOWN_ATTR
    if is_valid_sentence(sent):
        sent = clean_sentence(sent)
        for item in KNOWN_ATTR:
            sent = sent.replace(item, KNOWN_ATTR[item])  # fix: str.replace returns a new string; the result was previously discarded
        deontic_types = get_deontic_type(sent)
        if 'shall ' in deontic_types.split(' | ') or 'shall not ' in deontic_types:
            input_word = 'shall'
        else:
            # fix: take the first word of the first detected deontic phrase
            # (indexing the string directly only yielded its first character)
            input_word = deontic_types.split(' | ')[0].strip().split()[0].strip()
        subjs, objs, depseq = get_subjects_and_objects(sent, input_word)
        if subjs == -1:
            # no verb governs the deontic, so this cannot be a regulatory sentence
            return {'pred': 0, 'attr': ''}
        else:
            propns = []
            # Subjects found in the sentence
            if len(subjs) > 0:
                for item in subjs:
                    if len(item['text'].strip()) > 1:
                        if (item['pos'] == 'PROPN') and not contains_kw_token(item['text'], EXCLUDED_ATTR):  # 1. Best case: proper noun subject
                            propns.append(item['text'])
                        if (item['pos'] == 'NOUN') and contains_agent_noun(item['text']) and not contains_kw_token(item['text'], EXCLUDED_ATTR):  # 2. Next best case: check if subject is an agent noun (ConceptNet subset)
                            propns.append(item['text'])
                if len(propns) > 0:
                    return {'pred': 1, 'attr': '|'.join(propns)}
                # 3. Check for 'they' or 'it' subjects
                # for item in subjs:
                #     if len(item['text'].strip()) > 1:
                #         if (item['pos'] == 'PRON') and item['text'].lower().strip() in ['they', 'it']:
                #             propns.append(item['text'])
                # if len(propns) > 0:
                #     return {'pred' : 1, 'attr' : '|'.join(propns)}
            # No subjects found in the sentence
            else:
                # 3. Third best case: check for passive voice objects (hidden subjects)
                if contains_sequence(depseq, 'agent', 'pobj') or contains_sequence(depseq, 'prep', 'pobj'):
                    for item in objs:
                        if len(item['text'].strip()) > 1:
                            if (item['pos'] == 'PROPN') and not contains_kw_token(item['text'], EXCLUDED_ATTR):
                                propns.append(item['text'])
                            if (item['pos'] == 'NOUN') and contains_agent_noun(item['text']) and not contains_kw_token(item['text'], EXCLUDED_ATTR):  # check if objects are agent nouns
                                propns.append(item['text'])
                    if len(propns) > 0:
                        return {'pred': 1, 'attr': '|'.join(propns)}
            return {'pred': 0, 'attr': ''}
    else:
        return {'pred': 0, 'attr': ''}
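# Illustrative end-to-end usage (a sketch: the prediction and extracted attribute depend on the
# spaCy parse, the supplied agent noun list, and the exclusion lists, so treat it as indicative):
#   is_regulatory_or_constitutive('The applicant shall submit the tender documents.')
#   would be expected to return something like {'pred': 1, 'attr': 'The applicant'},
#   whereas a definitional sentence hitting EXCLUDED_PHRASES returns {'pred': 0, 'attr': ''}.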
# END: function definitions
# BEGIN: apply classifier to input sentences
df = pd.read_csv(IN_FNAME)
rule_predictions = []
attributes = []
for index, row in df.iterrows():
    print(index, '/', len(df))
    if len(row['sent'].split()) > 1000:
        # sentences longer than 1000 tokens are classified as constitutive without parsing
        prediction = {'pred': 0, 'attr': ''}
    else:
        prediction = is_regulatory_or_constitutive(row['sent'])
    rule_predictions.append(prediction['pred'])
    attributes.append(prediction['attr'])
# extend dataframe with classifier predictions in two new columns
df['regulatory_according_to_rule'] = rule_predictions
df['attribute_according_to_rule'] = attributes
# write dataframe result to file
df.to_csv(OUT_FNAME, index=False)
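# Example invocation (file names are hypothetical):
#   python rule-based-classification.py --input sentences.csv --output predictions.csv --agents agent_nouns.json
# The input CSV must contain a 'sent' column; the output CSV gains the two prediction columns above.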