last_revision_analysis.py
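"""Classify how each broken link was handled in an article's last revision.

For every shard directory, the script loads the broken-link metadata from
edit_history/iter/probe_live_broken_links.json, scans each *-last.txt
revision file, and uses the local fuzzy_match helpers (check_if_augmented,
check_if_removed) to label each broken link as "augmented", "removed", or
"no_action". Per-article results are written to <article>-arn.json and
per-shard counts are printed at the end.
"""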
import os
import json
import re
import fuzzy_match
from tqdm import tqdm

# Process one shard of the edit-history directories per iteration.
for SHARD in range(4):
    DIRS = [f"edit_history/edit_history_{i}" for i in range(1, 5)]
    ITER_DIR = "edit_history/iter/"

    # Broken-link metadata, keyed by directory and then by article name.
    with open(os.path.join(ITER_DIR, "probe_live_broken_links.json"), "r", encoding="utf-8") as file:
        ALL_BROKEN_LINKS = json.load(file)

    # Per-shard counters for how each broken link was handled.
    ALL_AUG_BIASED = 0
    ALL_REMOVED = 0
    ALL_NO_ACTION = 0

    def process_dir(dir_path):
        global ALL_AUG_BIASED, ALL_REMOVED, ALL_NO_ACTION
        print(dir_path)
        for filename in tqdm(os.listdir(dir_path)):
            if not filename.endswith("-last.txt"):
                continue
            article_name = filename.replace("-last.txt", "")
            if article_name not in ALL_BROKEN_LINKS[dir_path]:
                continue
            with open(f"{dir_path}/{filename}", "r", encoding="utf-8") as rev_file:
                revision_last = rev_file.read()
            arn = {}
            # First check augmentation; only if that fails, check removal.
            for link in ALL_BROKEN_LINKS[dir_path][article_name]["list_of_links"]:
                link_url = link["url"]
                aug_match = fuzzy_match.check_if_augmented(link_url, revision_last, {})
                removed = fuzzy_match.check_if_removed(link_url, revision_last)
                if aug_match:
                    link["last_revision_AUG_REM"] = {
                        "type": "augmented",
                        "match": aug_match,
                    }
                    ALL_AUG_BIASED += 1
                elif removed:
                    link["last_revision_AUG_REM"] = {"type": "removed"}
                    ALL_REMOVED += 1
                else:
                    link["last_revision_AUG_REM"] = {"type": "no_action"}
                    ALL_NO_ACTION += 1
                arn[link["url"]] = link
            # Write the per-article results next to the revision file.
            with open(f"{dir_path}/{article_name}-arn.json", "w", encoding="utf-8") as out_file:
                json.dump(arn, out_file, indent=4)

    # if __name__ == "__main__":
    process_dir(DIRS[SHARD])
    print(ALL_AUG_BIASED, ALL_REMOVED, ALL_NO_ACTION,
          ALL_AUG_BIASED + ALL_REMOVED + ALL_NO_ACTION)
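
# Data-shape sketch, inferred only from the accesses above; the real
# probe_live_broken_links.json may carry additional fields, and the example
# URL below is hypothetical:
#
# ALL_BROKEN_LINKS = {
#     "edit_history/edit_history_1": {
#         "Some_Article": {
#             "list_of_links": [
#                 {"url": "http://example.com/dead-page", ...},
#             ],
#         },
#     },
#     ...
# }
#
# Each processed link gains a "last_revision_AUG_REM" entry, e.g.
# {"type": "augmented", "match": <whatever check_if_augmented returned>},
# and the dict written to <article>-arn.json maps each link URL to its
# annotated link record.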