-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathparse_mysqldump.py
137 lines (114 loc) · 6.2 KB
/
parse_mysqldump.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# coding: utf-8
import re
import gzip
import sys
from tqdm import tqdm
from collections import namedtuple
# Per-filetype bundle: compiled row parser, expected number of fields, and the
# default (0-based) indexes of the columns written to the output file.
FILEPROPS=namedtuple("Fileprops", "parser num_fields column_indexes")
#CATEGORYLINKS_PARSER=re.compile(r'(?P<row0>[0-9]+?),(?P<row1>\'.*?\'?),(?P<row2>\'.*?\'?),(?P<row3>\'[0-9\ \-:]+\'?),(?P<row4>\'\'?),(?P<row5>\'.*?\'?),(?P<row6>\'.*?\'?)')
# Each *_PARSER matches one "v1,v2,..." value tuple (outer parentheses already
# stripped by process_insert_values_line); named groups row0..rowN carry the
# columns, in table-schema order (see the schema notes string below).
CATEGORYLINKS_PARSER=re.compile(r'^(?P<row0>[0-9]+?),(?P<row1>\'.*?\'?),(?P<row2>\'.*?\'?),(?P<row3>\'[0-9\ \-:]+\'?),(?P<row4>\'.*?\'?),(?P<row5>\'[a-z\-]*?\'?),(?P<row6>\'[a-z]+\'?)$')
PAGELINKS_PARSER=re.compile(r'^(?P<row0>[0-9]+?),(?P<row1>[0-9]+?),(?P<row2>\'.*?\'?),(?P<row3>[0-9]+?)$')
REDIRECT_PARSER=re.compile(r'^(?P<row0>[0-9]+?),(?P<row1>-?[0-9]+?),(?P<row2>\'.*?\'?),(?P<row3>\'.*?\'?),(?P<row4>\'.*?\'?)$')
CATEGORY_PARSER=re.compile(r'^(?P<row0>[0-9]+?),(?P<row1>\'.*?\'?),(?P<row2>[0-9]+?),(?P<row3>[0-9]+?),(?P<row4>[0-9]+?)$')
# NOTE(review): unlike the other parsers, PAGE_PROPS_PARSER uses unnamed
# groups (no row0.. names), so its columns are only reachable positionally.
PAGE_PROPS_PARSER=re.compile(r'^([0-9]+),(\'.*?\'),(\'.*?\'),(\'[0-9\ \-:]+\'),(\'\'),(\'.*?\'),(\'.*?\')$')
PAGE_PARSER=re.compile((r'^(?P<row0>[0-9]+?),(?P<row1>[0-9]+?),(?P<row2>\'.*?\'?),(?P<row3>\'.*?\'?),(?P<row4>[0-9]+?),(?P<row5>[0-9]+?),(?P<row6>[0-9]?),'
    r'(?P<row7>[0-9\.]+?),(?P<row8>\'.*?\'?),(?P<row9>(?P<row9val>\'.*?\'?)|(?P<row9null>NULL)),(?P<row10>[0-9]+?),(?P<row11>[0-9]+?),'
    r'(?P<row12>(?P<row12val>\'.*?\'?)|(?P<row12null>NULL)),(?P<row13>(?P<row13val>\'.*?\'?)|(?P<row13null>NULL))$'))
"""
# page
`page_id` int(8) unsigned NOT NULL AUTO_INCREMENT,
`page_namespace` int(11) NOT NULL DEFAULT '0',
`page_title` varbinary(255) NOT NULL DEFAULT '',
`page_restrictions` tinyblob NOT NULL,
`page_counter` bigint(20) unsigned NOT NULL DEFAULT '0',
`page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT '0',
`page_is_new` tinyint(1) unsigned NOT NULL DEFAULT '0',
`page_random` double unsigned NOT NULL DEFAULT '0',
`page_touched` varbinary(14) NOT NULL DEFAULT '',
`page_links_updated` varbinary(14) DEFAULT NULL,
`page_latest` int(8) unsigned NOT NULL DEFAULT '0',
`page_len` int(8) unsigned NOT NULL DEFAULT '0',
`page_content_model` varbinary(32) DEFAULT NULL,
`page_lang` varbinary(35) DEFAULT NULL,
# pagelinks
`pl_from` int(8) unsigned NOT NULL DEFAULT '0',
`pl_namespace` int(11) NOT NULL DEFAULT '0',
`pl_title` varbinary(255) NOT NULL DEFAULT '',
`pl_from_namespace` int(11) NOT NULL DEFAULT '0',
"""
# Supported mysqldump table names mapped to their parsing properties.
FILETYPE_PROPS=dict(
categorylinks=FILEPROPS(CATEGORYLINKS_PARSER, 7, (0, 1, 6)),
pagelinks=FILEPROPS(PAGELINKS_PARSER, 4, (0, 1, 2, 3)),
redirect=FILEPROPS(REDIRECT_PARSER, 5, (0, 1, 2)),
category=FILEPROPS(CATEGORY_PARSER, 5, (0, 1, 2, 3, 4)),
page_props=FILEPROPS(PAGE_PROPS_PARSER, 7, (0, 1)),
page=FILEPROPS(PAGE_PARSER, 14, (0, 1, 2, 5, 11, 12, 13)),
)
#VALUE_PARSER=re.compile(r'\(([0-9]+),(\'.*?\'),(\'.*?\'),(\'[0-9\ \-:]+\'),(\'\'),(\'.*?\'),(\'.*?\')\)')
def parse_match(match, column_indexes):
    """Extract the requested columns from one matched row.

    Args:
        match: ``re.Match`` produced by one of the *_PARSER patterns.
        column_indexes: iterable of 0-based column indexes to extract.

    Returns:
        Tuple of the matched column strings, in ``column_indexes`` order.
    """
    row = match.groupdict()
    if row:
        # Most parsers name their groups row0, row1, ...
        return tuple(row["row{}".format(i)] for i in column_indexes)
    # Fall back to positional groups for parsers without named groups
    # (PAGE_PROPS_PARSER); previously groupdict() was empty for it, every
    # lookup raised KeyError, and page_props produced no output at all.
    # Positional group numbering is 1-based.
    return tuple(match.group(i + 1) for i in column_indexes)
def parse_value(value, parser, column_indexes, value_idx=0, pbar=None):
    """Yield the selected columns for every row *parser* matches in *value*.

    Writes a diagnostic to stderr when column extraction fails for a match,
    or when the parser matches nothing in *value* at all.
    """
    # Replace the backslash-escaped unicode en-dash with a plain ascii dash.
    value = value.replace("\\xe2\\x80\\x93", "-")
    matched_any = False
    for match in parser.finditer(value):
        matched_any = True
        try:
            row = parse_match(match, column_indexes)
        except Exception as e:
            print("Line: {!r}, Exception: {}".format(value, e), file=sys.stderr)
        else:
            yield row
    if not matched_any:
        print("Line: {!r}, IDX: {}, Exception: {}".format(value, value_idx, "Unable to parse."), file=sys.stderr)
def process_insert_values_line(line, parser, column_indexes, count_inserts=0, pbar=None):
    """Yield parsed rows from a single ``INSERT INTO ... VALUES ...`` line.

    Args:
        line: one full INSERT statement from the dump.
        parser: compiled regex for one value tuple of this table.
        column_indexes: 0-based indexes of the columns to emit.
        count_inserts: running count of INSERT lines, shown in the progress bar.
        pbar: optional tqdm progress bar; may be None.
    """
    _, _, values = line.partition(' VALUES ')
    # Each insert statement has format:
    # INSERT INTO "table_name" VALUES (v1,v2,v3),(v1,v2,v3),(v1,v2,v3);
    # After strip(), slicing [1:-2] drops the leading "(" and trailing ");",
    # then splitting on "),(" leaves one string per value tuple.
    values = values.strip()[1:-2].split("),(")
    if pbar is not None:
        # Guard: pbar defaults to None, but the original code dereferenced it
        # unconditionally and crashed when the default was used.
        pbar.set_postfix(found_values=len(values), insert_num=count_inserts)
    for value_idx, value in enumerate(values):
        for row in parse_value(value, parser, column_indexes, value_idx, pbar):
            yield row
def process_file(fp, fp_out, filetype, column_indexes=None, silent=False):
    """Stream a mysqldump, parse its INSERT statements, and write TSV rows.

    Args:
        fp: readable text handle over the SQL dump.
        fp_out: writable text handle receiving tab-separated rows.
        filetype: key into FILETYPE_PROPS (e.g. "page", "categorylinks").
        column_indexes: columns to emit; None selects the filetype default.
            A string such as "0,1,6" is parsed as comma-separated indexes.
        silent: disable the tqdm progress bar.

    Raises:
        Exception: if *filetype* is not a supported table name.
    """
    if filetype not in FILETYPE_PROPS:
        raise Exception("Invalid filetype: {}".format(filetype))
    parser, num_fields, ci = FILETYPE_PROPS[filetype]
    print("Parser: {}\nnum_fields: {}\nci: {}".format(parser, num_fields, ci))
    if column_indexes is None:
        column_indexes = ci
    elif isinstance(column_indexes, str):
        # The CLI forwards --column-indexes as a raw string like "0,1";
        # previously it was iterated character-by-character (yielding "0",
        # ",", "1") and column selection always failed. Parse it here.
        column_indexes = tuple(int(i) for i in column_indexes.split(","))
    insert_prefix = 'INSERT INTO `{}` VALUES '.format(filetype)
    with tqdm(disable=silent) as pbar:
        count_inserts = 0
        for line in fp:
            if line.startswith(insert_prefix):
                count_inserts += 1
                for row in process_insert_values_line(
                        line, parser, column_indexes, count_inserts, pbar):
                    pbar.update(1)
                    print("\t".join(row), file=fp_out)
def main():
    """CLI entry point: convert a wikipedia *.sql.gz dump into a TSV file."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("filename",
            help="name of the wikipedia sql.gz file.")
    parser.add_argument("filetype",
            help="following filetypes are supported:\n[{}]".format(
                ", ".join(FILETYPE_PROPS.keys())
            ))
    parser.add_argument("outputfile",
            help="name of the output file")
    parser.add_argument("--column-indexes", "-c", default=None,
            # Convert "0,1,6" to (0, 1, 6); previously the raw string was
            # forwarded downstream and broke column selection entirely.
            type=lambda s: tuple(int(i) for i in s.split(",")),
            help="column indexes to use in output file")
    parser.add_argument("--silent", "-q", default=False, action="store_true",
            help="Disable tqdm output.")
    args = parser.parse_args()
    print(args)
    # Dumps are read as ascii; errors='backslashreplace' keeps undecodable
    # bytes visible as \xNN escapes instead of raising UnicodeDecodeError.
    with gzip.open(args.filename, 'rt', encoding='ascii', errors='backslashreplace') as fp, \
            open(args.outputfile, 'wt', encoding='utf-8') as fp_out:
        process_file(fp, fp_out, args.filetype,
                     column_indexes=args.column_indexes, silent=args.silent)
# Example invocation kept for reference:
#with gzip.open("enwiki-20170920-categorylinks.sql.gz", 'rt') as fp, open("categorylinks.txt", "wt") as fp_out:
if __name__ == "__main__":  # script entry point
    main()