-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathinliner.py
executable file
·366 lines (299 loc) · 14.7 KB
/
inliner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
import subprocess
import argparse
import os
import sys
import shutil
import magic
import re
import base64
import codecs
import jsbeautifier
import logging
import unicodedata
# Module logger: INFO by default, switched to DEBUG by the -v flag in main().
log = logging.getLogger('inliner')
log.setLevel(logging.INFO)
log.addHandler(logging.StreamHandler(sys.stderr))
import cssutils
# Silence cssutils' very chatty parser warnings; malformed downloaded CSS is common.
cssutils.log.setLevel(logging.FATAL)
from bs4 import BeautifulSoup
# Match <link ...> / <meta ...> tags so unclosed ones can be rewritten to
# self-closing form before parsing (see build_resource_map in main()).
# Raw strings: the original plain strings relied on "\s" not being a
# recognised escape, which raises a warning on modern Python.
re_link = re.compile(r"<link(.(?!>))*..")
re_meta = re.compile(r"<meta(.(?!>))*..")
# Match CSS url(...) references; the (?!ata:) lookahead skips references that
# are already inlined data: URIs.
re_url = re.compile(r"url\s*\(\s*[\"']?((.(?!ata:))+?)[\"']?\s*\)")
def expand_url_carriers(style_declaration, file_map):
    """Rewrite url(...) references in *style_declaration* to base64 data URIs.

    Only properties that can carry URLs (src, background, background-image,
    list-style-image) are touched; references whose path is not a key of
    *file_map* are left as-is.
    """
    def replace(match):
        ref = match.group(1)
        # Direct 'in file_map' membership is O(1); the original tested
        # 'in file_map.keys()', which builds a list on Python 2 (O(n) scan).
        if ref and ref in file_map:
            entry = file_map[ref]
            return "url(data:%s;base64,%s)" % (entry['mime'], entry['value'])
        return match.group(0)

    for prop_name in ('src', 'background', 'background-image', 'list-style-image'):
        carrier = style_declaration.getProperty(prop_name)
        if carrier is not None:
            carrier.value = re.sub(re_url, replace, carrier.value)
def expand_css(file_map):
    """Flatten @import rules and inline url() references in every CSS entry
    of *file_map*, updating each entry's 'value' in place."""
    log.info('\nExpanding stylesheets...')

    def expand_single_urls(stylesheet):
        # Inline url(...) references in style and @font-face rules.
        for rule in stylesheet:
            if isinstance(rule, (cssutils.css.CSSFontFaceRule, cssutils.css.CSSStyleRule)):
                expand_url_carriers(rule.style, file_map)
        return stylesheet

    def expand_single_import(stylesheet):
        # BUG FIX: the original gathered the indices of ALL @import rules and
        # then inserted/deleted in one pass - but every insertRule/deleteRule
        # shifts the positions of later rules, so with more than one @import
        # the stored indices went stale and the wrong rules were touched.
        # Expand one @import per pass and recurse instead.
        for i, rule in enumerate(stylesheet):
            # Imports whose href was never downloaded (absolute CDN URLs etc.)
            # are skipped instead of raising KeyError as before.
            if isinstance(rule, cssutils.css.CSSImportRule) and rule.href in file_map:
                deps = cssutils.parseString(file_map[rule.href]['value'])
                for j, dep in enumerate(deps):
                    stylesheet.insertRule(dep, index=i + j + 1)
                stylesheet.deleteRule(i)
                return expand_single_import(stylesheet)
        # No expandable imports left: inline the url() references.
        return expand_single_urls(stylesheet)

    for key in file_map.keys():
        if file_map[key]['mime'] == "text/css":
            log.debug('- %s' % key)
            stylesheet = expand_single_import(cssutils.parseString(file_map[key]['value']))
            file_map[key]['value'] = stylesheet.cssText
def inline(soup, downloadDir, file_map):
    """Replace external resource references in *soup* with inlined content.

    Handles style attributes, <script src>, <link href>, <img src> and
    <video>/<source> tags. References whose path is not a key of *file_map*
    (e.g. absolute CDN URLs that were never downloaded) are left untouched,
    which does not affect correct display of the page.
    """
    log.info('\nInlining resources...')

    def inline_style(style):
        # Inline url(...) references inside a style="" attribute value.
        style_declaration = cssutils.parseStyle(style)
        expand_url_carriers(style_declaration, file_map)
        return style_declaration.cssText

    def inline_script(script_tag):
        # Replace <script src=...> with an inline <script> carrying the source.
        src = script_tag['src']
        if src not in file_map:
            return script_tag
        # ('mime_type' renamed from the original 'type' to avoid shadowing the builtin.)
        mime_type = script_tag['type'] if 'type' in script_tag.attrs else "text/javascript"
        tag = soup.new_tag("script", type=mime_type, _from=src)
        tag.append(soup.new_string(file_map[src]['value']))
        return tag

    def inline_link(link_tag):
        # Replace <link href=...> with an inline <style>, <script> or data-URI link.
        href = link_tag['href']
        if href is None or href not in file_map:
            return link_tag
        # Work out the resource type: extension shortcuts first, then the
        # tag's own attributes.
        mime_type = None
        if href.endswith('.js'):
            mime_type = 'text/javascript'
        elif href.endswith('.css'):
            mime_type = 'text/css'
        else:
            for ext in ['png', 'gif', 'jpg', 'jpeg']:
                if href.endswith(ext):
                    mime_type = ('image/%s' % ext)
                    break
        if mime_type is None and (href.endswith('.ico') or ('type' in link_tag.attrs and link_tag['type'] == 'image/x-icon')):
            mime_type = 'image/x-icon'
        if mime_type is None and 'type' in link_tag.attrs:
            mime_type = link_tag['type']
        if mime_type is None and 'rel' in link_tag.attrs:
            rel = link_tag['rel']
            # BUG FIX: bs4 exposes 'rel' as a list of values, so the original
            # equality test against u'stylesheet' could never match there.
            if rel == 'stylesheet' or (isinstance(rel, list) and 'stylesheet' in rel):
                mime_type = "text/css"
        if mime_type is None:
            return link_tag
        if 'css' in mime_type:
            tag = soup.new_tag("style", type=mime_type, _from=href)
            tag.append(soup.new_string(file_map[href]['value']))
        elif 'image' in mime_type:
            tag = soup.new_tag("link", type=mime_type, _from=href, href="data:%s;base64,%s" % (file_map[href]['mime'], file_map[href]['value']))
        else:
            tag = soup.new_tag("script", type=mime_type, _from=href)
            tag.append(soup.new_string(file_map[href]['value']))
        return tag

    def inline_video(video_tag):
        # Rewrite a <video src> or its <source> children to data URIs.
        # BUG FIX: use .get() - a <video> without a src attribute used to
        # raise KeyError before its <source> children could be processed.
        src = video_tag.get('src')
        sources = video_tag.find_all('source')
        if src and src in file_map:
            video_tag['src'] = "data:%s;base64,%s" % (file_map[src]['mime'], file_map[src]['value'])
            video_tag['_from'] = src
        elif sources:
            for source in sources:
                src = source['src'].strip()
                if src in file_map:
                    mime = source['type'] if 'type' in source.attrs else file_map[src]['mime']
                    source['src'] = "data:%s;base64,%s" % (mime, file_map[src]['value'])
                    source['_from'] = src
        return video_tag

    def inline_img(img_tag):
        # Replace <img src=...> with a data-URI version.
        src = img_tag['src']
        if src in file_map:
            return soup.new_tag("img", _from=src, src=("data:%s;base64,%s" % (file_map[src]['mime'], file_map[src]['value'])))
        log.debug('Omitting resource %s' % src)
        return img_tag

    for tag in soup.find_all(lambda tag: "style" in tag.attrs and tag['style'] is not None):
        tag['style'] = inline_style(tag['style'])
    for tag in soup.find_all(lambda tag: "script" == tag.name and "src" in tag.attrs and tag['src'] is not None):
        tag.replaceWith(inline_script(tag))
    for tag in soup.find_all(lambda tag: "link" == tag.name and "href" in tag.attrs and tag['href'] is not None):
        tag.replaceWith(inline_link(tag))
    for tag in soup.find_all(lambda tag: "img" == tag.name and "src" in tag.attrs and tag['src'] is not None):
        tag.replaceWith(inline_img(tag))
    for tag in soup.find_all(lambda tag: "video" == tag.name):
        tag.replaceWith(inline_video(tag))
    return soup
def main():
    """Command-line entry point: mirror a page with wget (unless --local is
    given) and print a single self-contained HTML document to stdout."""

    def parse_args():
        # Define and parse the command-line interface.
        parser = argparse.ArgumentParser()
        parser.add_argument('-u', '--uri', help='The URI to download and inline', required=True)
        parser.add_argument('-d', '--dir', help='The local folder where retrieved data will be stored', required=True)
        parser.add_argument('-i', '--inline', help='Inline the file of specified name from the local directory. If not specified, inliner will try to find the file automagically', required=False)
        parser.add_argument('-l', '--local', action='store_true', default=False, help='Use content from local directory, do not download data', required=False)
        parser.add_argument('-p', '--prettify', action='store_true', default=False, help='Prettify javscript', required=False)
        parser.add_argument('-ni', '--no-images', action='store_true', default=False, help='Don\'t embed images', required=False)
        parser.add_argument('-nf', '--no-fonts', action='store_true', default=False, help='Don\'t embed fonts', required=False)
        parser.add_argument('-nv', '--no-videos', action='store_true', default=False, help='Don\'t embed videos', required=False)
        parser.add_argument('-v', '--verbose', action='store_true', default=False, help="verbose output", required=False)
        return parser.parse_args()

    def assert_wget_installed():
        # wget must be on PATH; it does the actual page mirroring.
        try:
            subprocess.check_output(['wget', '--version'])
        except OSError:
            log.critical("please install wget\n")
            sys.exit(1)

    def prepare_download_dir(path):
        # Create *path*, interactively wiping it first if it already exists.
        try:
            os.stat(path)
            if os.path.isdir(path):
                while True:
                    log.info('\nDirectory %s exists.' % os.path.join(os.getcwd(), path))
                    log.info('All content will be deleted. Do you want to continue? [y/n]')
                    # renamed from 'input': don't shadow the builtin
                    answer = sys.stdin.readline()
                    if answer == 'n\n' or answer == 'no\n':
                        sys.exit(0)
                    if answer == 'y\n' or answer == 'yes\n':
                        break
                shutil.rmtree(path)
                os.mkdir(path)
            else:
                # BUG FIX: the message contained a '%s' placeholder but the
                # path argument was never supplied.
                log.critical('%s is an existing file. Cowardly refusing to delete... goodbye\n' % path)
                sys.exit(0)
        except OSError:
            # os.stat failed -> nothing there yet, just create the directory.
            os.mkdir(path)

    def run_wget(uri, dir):
        # Mirror the page requisites (-p) into one flat directory (-nd),
        # converting links (-k) and spanning hosts (-H) so CDN assets come too.
        try:
            log.info("\nNow downloading files. Please wait...")
            process = subprocess.Popen(['wget', '-p', '-k', '-nd', '-H', '-P', dir, uri], stderr=subprocess.PIPE)
            for line in iter(process.stderr.readline, ''):
                log.debug(line[:-1])
        except OSError:
            log.critical("Error wgetting files.\n")
            sys.exit(1)

    def build_resource_map(downloaddir, inline_file=False, local=False, no_images=False, no_fonts=False, no_videos=False, prettify=False):
        """Scan *downloaddir* and build {filename: {'value':..., 'mime':...}}
        plus the parsed soup of the main HTML file.

        Returns (htmlsoup, htmlencoding, file_map); htmlsoup is None when no
        HTML file could be identified.
        """
        def read_text_file(path):
            # BUG FIX: the parameter used to be ignored in favour of a
            # closed-over variable named 'path'; it is now used explicitly,
            # files are closed, and only decode errors trigger the fallback.
            try:
                with codecs.open(path, 'r', 'utf-8') as f:
                    return f.read(), 'utf-8'
            except UnicodeDecodeError:
                with codecs.open(path, 'r', 'iso-8859-1') as f:
                    return f.read(), 'iso-8859-1'

        def fix_script_strings(s):
            # Break up any literal "</script>" inside JS source so the browser
            # does not terminate the inlined <script> element early.
            # BUG FIX: the original replacement string was "</' + 'cript>",
            # which silently dropped the 's' and corrupted the script.
            re_scr_str = re.compile("</script>")
            return re.sub(re_scr_str, lambda match: "</scr' + 'ipt>", s)

        def get_mime(file):
            # Extension shortcuts first; libmagic as a fallback.
            if file.endswith('.js'):
                return "text/javascript"
            elif file.endswith('.html') or file.endswith('.xhtml'):
                return "text/html"
            elif file.endswith('.css'):
                return "text/css"
            else:
                return magic.from_file(file, mime=True)

        htmlsoup = None
        file_map = {}
        htmlfile = None
        htmlencoding = None
        log.info("\nBuilding resource map...")
        log.debug("==========================================")
        # loop files, find base html file, b64 encode images
        for file in (unicodedata.normalize('NFC', f) for f in os.listdir(unicode(downloaddir))):
            path = os.path.join(downloaddir, file)
            mime = get_mime(path)
            log.debug('- %s [%s]' % (file, mime))
            if file == inline_file:
                # BUG FIX: the original assigned to a typo ('htmlenconding'),
                # so the detected encoding was silently lost when -i was used.
                maybe_html, htmlencoding = read_text_file(path)
                soup = BeautifulSoup(maybe_html)
                if soup.html is None:
                    log.critical("The file specified by -f does not seem to be an html file. Aborting.")
                    sys.exit(1)
                else:
                    htmlsoup = soup
                continue
            if 'text/' in mime:
                maybe_html, encoding = read_text_file(path)
                soup = BeautifulSoup(maybe_html)
                if soup.html is not None:
                    if htmlfile is None or file == 'index.html':  # index.html takes precedence
                        htmlfile = file
                        htmlencoding = encoding
                        # ok it is html, but often people do not close their link and meta tags
                        # which leads to malformed header data in the soup - let's fix that
                        html = re.sub(re_link, lambda match: match.group(0).strip() if match.group(0).strip().endswith("/>") else match.group(0).strip()[:-1] + "/>", maybe_html)
                        html = re.sub(re_meta, lambda match: match.group(0).strip() if match.group(0).strip().endswith("/>") else match.group(0).strip()[:-1] + "/>", html)
                        htmlsoup = BeautifulSoup(html)
                if 'javascript' in mime or file.endswith('.js'):
                    fixed = fix_script_strings(maybe_html)
                    file_map[file] = {'value': jsbeautifier.beautify(fixed) if prettify else fixed, 'mime': mime}
                else:
                    file_map[file] = {'value': maybe_html, 'mime': mime}
            # Binary resources: read in binary mode ('rb', not 'r') and close
            # the handles - the originals were opened in text mode and leaked.
            if 'image/' in mime and not no_images:
                with open(path, 'rb') as image:
                    file_map[file] = {'value': base64.b64encode(image.read()), 'mime': mime}
            if 'video/' in mime and not no_videos:
                with open(path, 'rb') as video:
                    file_map[file] = {'value': base64.b64encode(video.read()), 'mime': mime}
            elif not no_fonts:
                font_extensions = ['.eot', '.eot?', '.ttf', '.ttf?', '.woff', '.woff?']
                for ext in font_extensions:
                    if file.endswith(ext):
                        with open(path, 'rb') as font:
                            file_map[file] = {'value': base64.b64encode(font.read()), 'mime': mime}
                        break
        if htmlfile is not None:
            log.info("\nUsing %s" % htmlfile)
        return htmlsoup, htmlencoding, file_map

    # make sure wget is installed; wget is used to download a webpage
    # into a flat filestructure within a local folder while
    # automatically converting the links/references accordingly
    assert_wget_installed()
    # get command line arguments
    args = parse_args()
    uri = args.uri
    downloaddir = args.dir
    inline_file = args.inline
    local = args.local
    no_images = args.no_images
    no_fonts = args.no_fonts
    no_videos = args.no_videos
    prettify = args.prettify
    verbose = args.verbose
    if verbose:
        log.setLevel(logging.DEBUG)
    if not local:
        prepare_download_dir(downloaddir)
        run_wget(uri, downloaddir)
    htmlsoup, encoding, file_map = build_resource_map(downloaddir, inline_file=inline_file, local=local, no_images=no_images, no_fonts=no_fonts, no_videos=no_videos, prettify=prettify)
    if htmlsoup is not None:
        expand_css(file_map)
        soup = inline(htmlsoup, downloaddir, file_map)
        # Fall back to utf-8 if no encoding was detected, rather than crashing.
        print(soup.prettify().encode(encoding or 'utf-8'))
    else:
        log.critical("\nCould not find any html file to inline in folder: %s\n" % downloaddir)
        sys.exit(1)
# Script entry point: run only when executed directly, not when imported.
if __name__ == '__main__':
    main()