-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathremoval_reason_filters.py
101 lines (85 loc) · 2.39 KB
/
removal_reason_filters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import json
import os
import re
# Regular-expression patterns describing edit-summary comments to exclude.
#
# Every entry is a plain pattern string. The commented-out snippet later in
# this file applies each one with ``re.search`` to the lower-cased edit
# comment and drops a result as soon as any pattern matches, so patterns
# should be written in lower case and match as substrings.
#
# NOTE(review): short entries such as "update", "wrong" and "random" match
# anywhere inside a comment (e.g. "updated"); confirm that breadth is intended.
FILTERS = [
    # Disabled patterns -- presumably the removal reasons this project wants
    # to keep rather than filter out (original label: "our interest");
    # TODO confirm.
    # r"fix link",
    # r"fix ref",
    # r"dead link",
    # r"found working.*links",
    # r"ref.* fix",
    # r"remove link",
    # r"remove ref",

    # Uncategorised (the original marker here was "---???---").
    r"external links",
    # The dots are escaped so that only the literal domain names match;
    # unescaped, "archive.is" also matched prose such as "archive is".
    r"archive\.today",
    r"archive\.is",
    r"overlinking",
    r"too many link",
    r"too many ref",

    # Script-assisted / bot-generated edit summaries.
    r"script.*assisted fix.*\[\[mos:\s*.*?\]\]",
    r"\[\[.*use this bot\]\]",
    r"citation bot",
    r"referenceexpander",
    r"waybackmedic",

    # Content-based reasons.
    r"observance",
    r"source.*skeptical",
    r"factual",
    r"vague.*not.*related",
    r"repetition",
    r"remove.*region-specific",
    r"rule change",
    r"remove.*conjecture from",
    r"not relevant",
    r"wrong",
    r"standardize.*citation",
    r"update",
    r"reorganise",
    r"unreliable",
    r"random",

    # Section moves.
    r"moving.*to.*",
    r"move.*to.*",
    r"section.*move",

    # Disabled: empty comments, or comments holding only a /* section */ marker.
    # r"^\s*/\*.*\*/\s*$",
    # r"^$",

    # Disabled: miscellaneous one-off summaries.
    # r"detail revision",
    # r"cleaning up of section",
    # r"filled in 2 bare reference",
    # r"neither of the sources",
    # r"myth",
    # r"tidy some references"
]
# print(len(results))
# # Apply regex-based filtering
# # len(r["remove-purposely"][-1]["edit_meta_from"].get("comment", "")) <= 200 and
# results = [
# r for r in results
# if not any(
# re.search(
# f,
# r["remove-purposely"][-1]["edit_meta_from"].get("comment", "").lower()
# ) for f in FILTERS
# )
# ]
# print(len(results))
# probe_archive_queue = []
# for item in results:
# # item = results[0]
# oldid = item["remove-purposely"][-1]["edit_meta_from"]["id"]
# article_name = item["article_name"]
# wiki_diff_link = f"https://en.wikipedia.org/w/index.php?title={article_name.replace(' ', '_')}&diff=prev&oldid={oldid}"
# # print(json.dumps(results[0], indent=2))
# # print(json.dumps({
# probe_archive_queue.append({
# "url": item["url"],
# "wiki_diff_link": wiki_diff_link,
# "timestamp": item["remove-purposely"][-1]["edit_meta_from"]["timestamp"],
# "comment": item["remove-purposely"][-1]["edit_meta_from"].get("comment", ""),
# "username": item["remove-purposely"][-1]["edit_meta_from"].get("username", item["remove-purposely"][-1]["edit_meta_from"].get("ip", "Unknown")),
# })
# print(probe_archive_queue[0])