-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharticle_searcher.py
126 lines (107 loc) · 4.16 KB
/
article_searcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Simple example of stream-parsing a Wikipedia XML dump.
# Copyright 2017 by Jeff Heaton, released under the GNU Lesser General Public License (LGPL).
# http://www.heatonresearch.com
# -----------------------------
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
# http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
# Directory that each of the filenames below is joined onto with os.path.join.
PATH_WIKI_XML = './'
# Input: the XML file that the script stream-parses with iterparse.
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'
# Outputs: CSV files the script opens for writing (one writer per file).
FILENAME_ARTICLES = 'articles.csv'
FILENAME_REDIRECT = 'articles_redirect.csv'
FILENAME_TEMPLATE = 'articles_template.csv'
# Text encoding passed to codecs.open for the three output CSV files.
ENCODING = "utf-8"
# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format an elapsed duration in seconds as ``H:MM:SS.ss``.

    The duration is rounded to hundredths of a second *before* it is split
    into hours, minutes and seconds.  Splitting first and letting the format
    spec round afterwards could push the seconds field up to ``60.00``
    (e.g. 59.999 -> ``0:00:60.00``); rounding first carries into the minutes
    and hours instead (``0:01:00.00``).

    Args:
        sec_elapsed: Non-negative elapsed time in seconds (int or float).

    Returns:
        The formatted string.  Hours are not zero-padded; minutes are padded
        to two digits and seconds to ``SS.ss``.
    """
    # Work in integer hundredths of a second so the carry is exact.
    total_centis = int(round(sec_elapsed * 100))
    hours, rest = divmod(total_centis, 60 * 60 * 100)
    minutes, centis = divmod(rest, 60 * 100)
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, centis / 100.0)
def strip_tag_name(t):
    """Return tag *t* without its leading ``{namespace}`` qualifier.

    Everything up to and including the last ``}`` is dropped, so
    ``"{uri}page"`` becomes ``"page"``.  A tag that contains no ``}`` is
    returned unchanged.

    Args:
        t: Element tag string, optionally prefixed with ``{namespace-uri}``.

    Returns:
        The part of the tag after the last ``}``.
    """
    # rpartition yields ('', '', t) when "}" is absent, so the last item is
    # the whole tag in that case and the local name otherwise.
    return t.rpartition("}")[2]
# Resolve the input XML file and the three CSV outputs under PATH_WIKI_XML.
pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
pathArticles = os.path.join(PATH_WIKI_XML, FILENAME_ARTICLES)
pathArticlesRedirect = os.path.join(PATH_WIKI_XML, FILENAME_REDIRECT)
pathTemplateRedirect = os.path.join(PATH_WIKI_XML, FILENAME_TEMPLATE)

# Title of the page whose XML is dumped for inspection when it is encountered.
SEARCH_TITLE = 'Chile at the 2004 Summer Olympics'

# Running totals reported at the end of the run.
totalCount = 0
articleCount = 0
redirectCount = 0
templateCount = 0

# Per-page state.  It is reset on every <page> start event; it is also
# initialised here so an <id>/<ns> element seen before the first <page>
# cannot hit an unbound name.
title = None
page_id = -1
redirect = ''
inrevision = False
ns = 0

start_time = time.time()

with codecs.open(pathArticles, "w", ENCODING) as articlesFH, \
        codecs.open(pathArticlesRedirect, "w", ENCODING) as redirectFH, \
        codecs.open(pathTemplateRedirect, "w", ENCODING) as templateFH:
    articlesWriter = csv.writer(articlesFH, quoting=csv.QUOTE_MINIMAL)
    redirectWriter = csv.writer(redirectFH, quoting=csv.QUOTE_MINIMAL)
    templateWriter = csv.writer(templateFH, quoting=csv.QUOTE_MINIMAL)

    # Header rows.  Per-page row output is switched off in this variant of
    # the script, so the headers are the only content written to the files.
    articlesWriter.writerow(['id', 'title', 'redirect'])
    redirectWriter.writerow(['id', 'title', 'redirect'])
    templateWriter.writerow(['id', 'title'])

    for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
        tname = strip_tag_name(elem.tag)

        # Search: if this element has a <title> child matching SEARCH_TITLE,
        # print the element's XML and stop in the debugger.
        # NOTE(review): this runs on 'start' events too, where iterparse does
        # not guarantee that children/text are populated yet, and on 'end'
        # events the children have already been cleared below -- so whether
        # the page is found may depend on parser buffering.  Confirm intent.
        if len(elem) > 1:
            for child in elem:
                if (strip_tag_name(child.tag) == 'title'
                        and child.text == SEARCH_TITLE):
                    etree.dump(elem)
                    # NOTE(review): interactive breakpoint kept on purpose;
                    # it blocks unattended runs.  Remove once the page has
                    # been inspected.
                    import pdb; pdb.set_trace()

        if event == 'start':
            if tname == 'page':
                # A new page begins: reset everything collected per page.
                title = ''
                page_id = -1
                redirect = ''
                inrevision = False
                ns = 0
            elif tname == 'revision':
                # Do not pick up on revision id's
                inrevision = True
        else:
            if tname == 'title':
                title = elem.text
            elif tname == 'id' and not inrevision:
                page_id = int(elem.text)
            elif tname == 'redirect':
                redirect = elem.attrib['title']
            elif tname == 'ns':
                ns = int(elem.text)
            elif tname == 'page':
                totalCount += 1

                # Classify the finished page.  A page carrying a <redirect>
                # element is a redirect; one without is an article (the two
                # counters were previously swapped).
                if ns == 10:
                    templateCount += 1
                elif redirect:
                    redirectCount += 1
                else:
                    articleCount += 1

                # Progress indicator every 100,000 pages.
                if totalCount > 1 and (totalCount % 100000) == 0:
                    print("{:,}".format(totalCount))

            # Release the finished element's text, attributes and children.
            # This must stay in the 'end' branch: clearing on 'start' would
            # wipe the attributes that the <redirect> handler reads on 'end'.
            elem.clear()

elapsed_time = time.time() - start_time

print("Total pages: {:,}".format(totalCount))
print("Template pages: {:,}".format(templateCount))
print("Article pages: {:,}".format(articleCount))
print("Redirect pages: {:,}".format(redirectCount))
print("Elapsed time: {}".format(hms_string(elapsed_time)))