-
Notifications
You must be signed in to change notification settings - Fork 34
/
lazyblorg.py
executable file
·647 lines (529 loc) · 26.1 KB
/
lazyblorg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python; -*-
# Version stamp of this script; the text between "<" and ">" is a date, a time
# and a user tag.
PROG_VERSION = "Time-stamp: <2020-10-03 19:51:01 vk>"
# Only the date part (YYYY-MM-DD, i.e. the ten characters following "<") is
# reported as the program version.
PROG_VERSION_DATE = PROG_VERSION.partition("<")[2][:10]
# TODO:
# * fix parts marked with «FIXXME»
## ===================================================================== ##
## You might not want to modify anything below this line if you do not ##
## know what you are doing :-) ##
## ===================================================================== ##
import os
import logging
from datetime import datetime
from sys import exit, argv
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from lib.utils import *
from lib.orgparser import *
from lib.htmlizer import *
import pickle # for serializing and storing objects into files
from time import time # for measuring execution time
# Start time of this invocation as a second-precision ISO 8601 string
# (YYYY-MM-DDTHH:MM:SS, local time, no time zone).
INVOCATION_TIME = datetime.now().isoformat(timespec="seconds")

# Text shown below the option list of the command line help; the argument
# parser is configured to keep its line breaks as written here.
EPILOG = (
    "\n"
    ":copyright: (c) 2013 and following by Karl Voit <[email protected]>\n"
    ":license: GPL v3 or any later version\n"
    ":URL: https://github.com/novoid/lazyblorg\n"
    ":bugreports: via github (preferred) or <[email protected]>\n"
    ":version: " + PROG_VERSION_DATE + "\n"
)
class Lazyblorg(object):
    """
    Central lazyblorg Class with main algorithm and methods

    Typical usage (see the script section of this file): create an instance
    with the parsed command line options and a logger, call
    determine_changes() and hand its first three results to
    generate_output().
    """

    # Names of the HTML blocks that have to be present in the template
    # definition; checked by _generate_template_definitions_from_template_data().
    # For documentation about the implemented elements: see
    # id:implemented-org-elements in dev/lazyblorg.org
    _REQUIRED_TEMPLATE_ELEMENTS = (
        'common-sidebar',
        'article-header',
        'article-footer',
        'article-header-begin',
        'article-tags-begin',
        'article-usertag',
        'article-autotag-generic',
        'article-autotag-language',
        'article-tags-end',
        'article-header-end',
        'article-end',
        'section-begin',
        'paragraph',
        'ul-begin',
        'ul-item',
        'ul-end',
        'pre-begin',
        'pre-end',
        'entrypage-header',
        'article-preview-header',
        'article-preview-begin',
        'article-preview-tags-begin',
        'article-preview-usertag',
        'article-preview-tags-end',
        'article-preview-more',
        'article-preview-end',
        'entrypage-footer',
    )

    # Class-level defaults; every instance re-binds them in __init__().
    options = None
    logging = None
    blog_data = []
    metadata = []  # meta-data of the current run of lazyblorg
    previous_metadata = None  # meta-data of the previous run of lazyblorg
    template_definitions = None  # list of definitions of templates
    # dict(year) of list(month) of list(day) of lists(entries) of IDs
    entries_timeline_by_published = None

    def __init__(self, options, logging):
        """
        Stores the configuration and resets all per-run state.

        @param options: parsed command line options (attributes such as orgfiles, logfilename, ... are read later)
        @param logging: logger object used for all messages of this instance
        """

        self.options = options
        self.logging = logging

        self.blog_data = []
        self.metadata = []  # meta-data of the current run of lazyblorg
        self.previous_metadata = None  # meta-data of the previous run of lazyblorg
        self.template_definitions = None
        self.entries_timeline_by_published = None

    def determine_changes(self):
        """
        Parses input Org-mode files, reads in previous meta-data file,
        and determines which articles changed in which way.

        @param return: generate: list of IDs of articles in blog_data/metadata that should be build
        @param return: marked_for_feed: list of IDs of articles in blog_data/metadata that are modified/new
        @param return: increment_version: list of IDs of articles in blog_data/metadata that got a new version
        @param return: stats_parsed_org_files: number of Org-mode files that were parsed
        @param return: stats_parsed_org_lines: sum of the line counts reported by the parser
        """

        options = self.options

        self.logging.info("• Parsing Org mode files …")
        stats_parsed_org_files, stats_parsed_org_lines = self._parse_all_orgmode_files()

        # dump blogdata for debugging purpose ...
        if options.verbose:
            with open('2del-lazyblorg_dump_of_blogdata_from_previous_verbose_run.pk', 'wb') as output:
                # written with the default pickle protocol; load it with
                # pickle.load() when debugging from outside
                pickle.dump(self.blog_data, output)

        # FIXXME: debug with: [x['id'] for x in self.blog_data]

        # generate persistent data which is used to compare this status
        # with the status of the next invocation:
        self.metadata, self.entries_timeline_by_published = Utils.generate_metadata_from_blogdata(
            self.blog_data)

        # write this status to the persistent data file:
        self._store_new_metadata()

        # load old metadata from file
        # NOTE(review): the new file is written before the old one is read;
        # if both options point to the same path, the "previous" status is
        # identical to the current one - confirm that this is intended.
        self._load_previous_metadata()

        # extract HTML templates and store in class var
        self.template_definitions = self._generate_template_definitions_from_template_data()

        # run comparing algorithm (last metadata, current metadata)
        generate, marked_for_feed, increment_version = self._compare_blogdata_to_metadata()

        return generate, marked_for_feed, increment_version, stats_parsed_org_files, stats_parsed_org_lines

    def _parse_all_orgmode_files(self):
        """
        Parses every file of options.orgfiles and appends the parsed
        entries to blog_data. Paths that are skipped by
        _parse_orgmode_file() (directories, missing files) are ignored and
        not counted. A parser error is reported via
        Utils.error_exit_with_userlog() with error code 20.

        @param return: number of parsed files, sum of parsed lines
        """

        stats_parsed_org_files, stats_parsed_org_lines = 0, 0

        for filename in self.options.orgfiles:
            try:
                # parsing one Org-mode file
                parse_result = self._parse_orgmode_file(filename)
            except OrgParserException as message:
                verbose_message = "Parsing error in file \"" + filename + \
                    "\" which is not good. Therefore, I stop here and hope you " + \
                    "can fix the issue in the Org-mode file. Reason: " + message.value
                Utils.error_exit_with_userlog(
                    self.options.logfilename, 20, verbose_message)
            else:
                if parse_result is None:
                    # path was skipped with a warning; nothing to unpack
                    continue
                file_blog_data, new_org_lines = parse_result
                self.blog_data += file_blog_data
                stats_parsed_org_files += 1
                stats_parsed_org_lines += new_org_lines

        return stats_parsed_org_files, stats_parsed_org_lines

    def _store_new_metadata(self):
        """
        Pickles [metadata, entries_timeline_by_published] of the current
        run into options.new_metadatafilename and creates the directory of
        that file if it is missing.
        """

        new_metadatafilename = self.options.new_metadatafilename
        metadata_directory = os.path.dirname(new_metadatafilename)

        # create path to new metadatafile if it does not exist; an empty
        # dirname means "current directory" and must not be passed to
        # os.makedirs() which would raise an exception for it:
        if metadata_directory and not os.path.isdir(metadata_directory):
            self.logging.debug(
                "path of new_metadatafilename \"" +
                new_metadatafilename +
                "\" does not exist. Creating …")
            os.makedirs(metadata_directory)

        with open(new_metadatafilename, 'wb') as output:
            pickle.dump([self.metadata,
                         self.entries_timeline_by_published],
                        output)

    def _load_previous_metadata(self):
        """
        Reads the meta-data of the previous run from
        options.previous_metadatafilename into previous_metadata. If that
        file does not exist, previous_metadata is left untouched.
        """

        previous_metadatafilename = self.options.previous_metadatafilename

        if not os.path.isfile(previous_metadatafilename):
            return

        self.logging.debug(
            "reading old \"" +
            previous_metadatafilename +
            "\" …")
        # NOTE: unpickling executes code embedded in the file; only point
        # this option to files that were written by lazyblorg itself.
        with open(previous_metadatafilename, 'rb') as metadata_file:
            # The timeline stored by the previous run is read but not
            # kept: entries_timeline_by_published has to stay in sync with
            # blog_data/metadata of the current run, which are handed to
            # the Htmlizer together with it.
            self.previous_metadata, _previous_timeline = pickle.load(metadata_file)

    def OLD_parse_HTML_output_template_and_generate_template_definitions(self):
        """
        Parse the template Org-mode file which contains the HTML
        definitions and store it into class variable
        template_definitions.

        NOTE(review): legacy code which is not called within this file. It
        refers to self._parse_HTML_output_template() which is not defined
        in this class (only OLD_parse_HTML_output_template() is) and passes
        an argument to a method that does not accept one. It is kept
        unchanged; fix or remove it before using it again.

        @param return: True if success
        """

        template_data = self._parse_HTML_output_template(
            self.options.templatefilename)

        self.template_definitions = self._generate_template_definitions_from_template_data(
            template_data)

        return True

    def generate_output(self, generate, marked_for_feed, increment_version):
        """
        Generates the htmlized output pages and the RSS/ATOM feeds.

        @param generate: list of IDs of articles in blog_data/metadata that should be build
        @param marked_for_feed: list of IDs of articles in blog_data/metadata that are modified/new (currently not handed over to the Htmlizer)
        @param increment_version: list of IDs of articles in blog_data/metadata that got a new version
        @param return: the result of Htmlizer.run()
        """

        htmlizer = Htmlizer(
            self.template_definitions,
            config.TAG_FOR_BLOG_ENTRY,
            self.options.targetdir,
            self.blog_data,
            self.metadata,
            self.entries_timeline_by_published,
            generate,
            increment_version,
            self.options.autotag_language,
            self.options.ignore_missing_ids)

        # FIXXME: try except HtmlizerException?
        return htmlizer.run()  # FIXXME: return value?

    def _parse_orgmode_file(self, filename):
        """
        This function handles the communication with the parser object and returns the blog data.

        @param filename: string containing one file name
        @param return: the result of OrgParser.parse_orgmode_file() or None if filename is a directory or not an existing file
        """

        if os.path.isdir(filename):
            self.logging.warning(
                "Skipping directory \"%s\" because this tool only parses files." %
                filename)
            return None

        if not os.path.isfile(filename):
            self.logging.warning(
                "Skipping \"%s\" because this tool only parses existing files. :-)" %
                filename)
            return None

        self.logging.debug("Parsing \"%s\" …" % filename)
        parser = OrgParser(filename)

        return parser.parse_orgmode_file()

    def OLD_parse_HTML_output_template(self, filename):
        """
        This function parses an Org-mode file which holds the definitions of the output format.

        NOTE(review): legacy code which is not called within this file.

        @param filename: string containing one file name of the definition file
        @param return: the result of OrgParser.parse_orgmode_file()
        """

        template_parser = OrgParser(filename)

        return template_parser.parse_orgmode_file()

    def _generate_template_definitions_from_template_data(self):
        """
        This function checks for (only) basic format definitions and exits
        if something important is missing.

        The template is the entry of blog_data with the ID
        'lazyblorg-templates' and the title 'Templates'. A missing template
        is reported with error code 40, a missing element definition with
        error code 42 (both via Utils.error_exit_with_userlog()).

        @param return: list of HTML definitions as Org-mode HTML block list-elements
        """

        self.logging.debug(
            'checking for basic template definitions in parsed data …')

        # extract template_data from blog_data:
        template_data = [x for x in self.blog_data if x['id'] ==
                         'lazyblorg-templates' and x['title'] == 'Templates']

        if not template_data:
            message = "Sorry, no suitable template data could be parsed from the Org-mode files. " + \
                "Please check if it meets all criteria as described in the original template " + \
                "file \"blog-format.org\"."
            Utils.error_exit_with_userlog(
                self.options.logfilename, 40, message)

        # content elements are sequences: [0] is the element type and [1]
        # the name of the block
        html_definitions = [x for x in template_data[0]['content']
                            if x[0] == 'html-block']

        found_elements = set(x[1] for x in html_definitions)

        for element in self._REQUIRED_TEMPLATE_ELEMENTS:
            if element not in found_elements:
                message = "Sorry, no definition for element \"" + element + "\" could be found within " + \
                    "the template definition file. " + "Please check if you mistyped its name or similar."
                Utils.error_exit_with_userlog(
                    self.options.logfilename, 42, message)

        return html_definitions

    def _compare_blogdata_to_metadata(self):
        """
        In this function, the previous status (previous_metadata) is
        compared to the status from the current parsing result
        (metadata). It implements "Decision algorithm for generating
        entries" as described in dev/lazyblorg.org.

        The algorithm distinguishes between eight cases:

        1) no ID found -> is not possible here any more since metadata
           (dict) contains only entries with IDs -> should be done in parser:
           WARN, ignore

        2) new ID found -> generate, mark_for_feed

        3) CREATED not found -> WARN, ignore

        4) CREATED found but differs from previous run (should not change)
           -> WARN, ignore; this includes a previous entry without CREATED

        5) and 6) known and matching previous run: ID, CREATED, checksum
           -> not changed (case 5/6 only differs in status of last timestamp)
           -> generate
           (FIXXME: in future, case 5 and 6 should result in "ignore". But
           for this, I have to switch from "delete everything and re-generate
           everything on every run" to "delete and re-generate only necessary
           entries/pages")

        7) known and matching: ID, CREATED, last timestamp; differs:
           checksum -> silent update -> generate

        8) known and matching: ID, CREATED; differs: checksum, last
           timestamp -> normal update -> generate, mark_for_feed,
           increment_version

        The format of the metadata is described in dev/lazyblorg.org >
        Notes > Internal format of meta-data.

        @param return: generate: a list of metadata-entries that should be generated
        @param return: marked_for_feed: a list of metadata-entries that are candidates to be propagated using RSS feeds
        @param return: increment_version: a list of metadata-entries that can be marked with an increased update number
        """

        self.logging.debug("compare_blog_metadata() called …")

        generate = []
        marked_for_feed = []
        increment_version = []

        metadata = self.metadata
        previous_metadata = self.previous_metadata

        if previous_metadata is None:
            self.logging.info(
                "no previous metadata found: must be the first run of lazyblorg with this configuration")

        for entry in metadata:  # ignore blog entries that have been gone since last run

            if previous_metadata is None:
                self.logging.debug(
                    "case 2: brand-new entry (first run of lazyblorg)")
                generate.append(entry)
                marked_for_feed.append(entry)
                continue

            if entry not in previous_metadata:
                self.logging.debug(
                    "no previous metadata found for this entry")
                self.logging.debug(
                    "case 2: brand-new entry (lazyblorg was run previously)")
                generate.append(entry)
                marked_for_feed.append(entry)
                continue

            current = metadata[entry]
            previous = previous_metadata[entry]

            # debug output of the previous values, ordered by their key:
            self.logging.debug(
                "\nprev " + str([value for _key, value in sorted(previous.items(), key=lambda item: item[0])]))

            if 'created' not in current:
                self.logging.debug(
                    "case 3: \"created\" missing -> WARN, ignore")
                message = "entry [" + entry + \
                    "] is missing its CREATED property. Will be ignored, until you fix it."
                Utils.append_logfile_entry(
                    self.options.logfilename, 'warn', message)
                self.logging.warning(message)
                continue

            # .get(): a previous entry without CREATED counts as "changed"
            # instead of aborting the whole run with a KeyError
            if current['created'] != previous.get('created'):
                self.logging.debug(
                    "case 4: \"created\" differs -> WARN, ignore")
                message = "CREATED property of entry [" + entry + "] has changed which should never happen. " + \
                    "Entry will be ignored this run. Will be created next run if CREATED will not change any more."
                Utils.append_logfile_entry(
                    self.options.logfilename, 'warn', message)
                self.logging.warning(message)
                continue

            # from here on, CREATED is known to be unchanged
            generate.append(entry)

            if current['checksum'] == previous['checksum']:
                self.logging.debug("case 5 or 6: old entry -> generate")
            elif current['latestupdateTS'] == previous['latestupdateTS']:
                self.logging.debug("case 7: silent update -> generate")
            else:
                self.logging.debug(
                    "case 8: normal update -> generate, mark_for_feed, increment_version")
                marked_for_feed.append(entry)
                increment_version.append(entry)

        return generate, marked_for_feed, increment_version
if __name__ == "__main__":
    # Command line entry point: parse the options, check them, parse the
    # Org-mode files, generate the HTML output and log some statistics.

    mydescription = "An Org-mode to HTML-blog system for very lazy people. Please refer to \n" + \
        "https://github.com/novoid/lazyblorg for more information."

    parser = ArgumentParser(prog=argv[0],
                            # keep line breaks in EPILOG and such
                            formatter_class=RawDescriptionHelpFormatter,
                            epilog=EPILOG,
                            description=mydescription)

    parser.add_argument(
        "--config",
        dest="configfilename",
        metavar='FILE',
        required=False,
        help="Path to an alternative configuration file.")

    parser.add_argument(
        "--orgfiles",
        dest="orgfiles",
        nargs='+',
        metavar='ORGFILE',
        required=True,
        help="One or more Org-mode files which contains all blog articles (and possible other stuff).")

    parser.add_argument(
        "--targetdir",
        dest="targetdir",
        metavar='DIR',
        required=True,
        help="Path where the HTML files will be written to. " +
        "On first run, this should be an empty directory.")

    parser.add_argument(
        "--previous-metadata",
        dest="previous_metadatafilename",
        metavar='FILE',
        required=True,
        help="Path to a file where persistent meta-data of the previous blog run " +
        "was written to. It will be read and used for comparison to the current run, the current situation." +
        "Therefore, this file can be missing in the first run.")

    parser.add_argument(
        "--new-metadata",
        dest="new_metadatafilename",
        metavar='FILE',
        required=True,
        help="Path to a file where persistent meta-data of the blog entries of the current run " +
        "is written to.")

    parser.add_argument(
        "--logfile",
        dest="logfilename",
        metavar='ORGFILE',
        required=True,
        help="Path to a file where warnings (inactive time-stamps) and errors " +
        "(active time-stamps) are being appended in Org-mode format. " +
        "It is highly recommended, that you add this file to your agenda list.")

    parser.add_argument(
        "--autotag-language",
        dest="autotag_language",
        action="store_true",
        help="Enable guessing of the language of a blog entry and using this as an auto-tag.")

    parser.add_argument(
        "-v",
        "--verbose",
        dest="verbose",
        action="store_true",
        help="Enable verbose mode which is quite chatty - be warned.")

    parser.add_argument(
        "-q",
        "--quiet",
        dest="quiet",
        action="store_true",
        help="Enable quiet mode: only warnings and errors will be reported.")

    parser.add_argument(
        "--ignore-missing-ids",
        dest="ignore_missing_ids",
        action="store_true",
        help="Disable raised exception for missing IDs. Handy for running preview_blogentry.sh.")

    # The "version" action prints the text and exits while the arguments
    # are being parsed. A plain flag which is checked after parse_args()
    # could never be used on its own because parse_args() already fails
    # when the required options above are missing.
    parser.add_argument("--version", action="version",
                        version=os.path.basename(argv[0]) + " version " + PROG_VERSION_DATE,
                        help="Display version and exit.")

    options = parser.parse_args()

    # from here on, the module-level name "logging" refers to the logger
    # object returned by Utils.initialize_logging()
    logging = Utils.initialize_logging(
        "lazyblorg", options.verbose, options.quiet)

    try:

        # lazyblorg-global settings from "config.py"
        if options.configfilename:
            if not os.path.isfile(options.configfilename):
                logging.critical(
                    "your alternative config file \"" +
                    options.configfilename +
                    "\" is not found. Please use default \"config.py\" or choose an existing file …")
                Utils.error_exit(-1)
            else:
                import importlib.machinery
                import importlib.util
                loader = importlib.machinery.SourceFileLoader('config', options.configfilename)
                spec = importlib.util.spec_from_loader(loader.name, loader)
                mod = importlib.util.module_from_spec(spec)
                loader.exec_module(mod)
                # NOTE(review): "mod" is executed but neither bound to the
                # name "config" nor registered as a module, so code that
                # reads "config.…" does not see the alternative settings
                # through this file - confirm how they are meant to be
                # applied.
        else:
            # load default config.py located at the lazyblorg source:
            import config

        if not os.path.isfile(options.logfilename):
            logging.debug(
                "log file \"" +
                options.logfilename +
                "\" is not found. Initializing with heading …")
            # the with-statement flushes and closes the file
            with open(options.logfilename, 'w', encoding='utf-8') as outputhandle:
                outputhandle.write(
                    "## -*- coding: utf-8 -*-\n" +
                    "## This file is best viewed with GNU Emacs Org-mode: http://orgmode.org/\n" +
                    "* Warnings and Error messages from lazyblorg     :lazyblorg:log:\n\n" +
                    "Messages gets appended to this file. Please remove fixed issues manually.\n\n")

        if options.verbose and options.quiet:
            logging.error("Options \"--verbose\" and \"--quiet\" found. " +
                          "This does not make any sense, you silly fool :-)")
            Utils.error_exit(1)

        if not os.path.isdir(options.targetdir):
            logging.critical(
                "Target directory \"" +
                options.targetdir +
                "\" is not a directory. Aborting.")
            Utils.error_exit(3)

        if not os.path.isfile(options.previous_metadatafilename):
            logging.warning(
                "Blog data file \"" +
                options.previous_metadatafilename +
                "\" is not found. Assuming first run!")

        logging.debug("extracting list of Org-mode files …")
        logging.debug("len(orgfiles) [%s]" % str(len(options.orgfiles)))
        if len(options.orgfiles) < 1:
            logging.critical(
                "Please add at least one Org-mode file name as argument")
            Utils.error_exit(6)

        # print file names if less than 10:
        if len(options.orgfiles) < 10:
            logging.debug("%s filenames found: [%s]" % (
                str(len(options.orgfiles)), '], ['.join(options.orgfiles)))
        else:
            logging.debug("%s filenames found" % str(len(options.orgfiles)))

        # main algorithm:
        time_before_parsing = time()
        lazyblorg = Lazyblorg(options, logging)

        # FIXXME: encapsulate following lines in lazyblorg.run() ?
        # lazyblorg.parse_HTML_output_template_and_generate_template_definitions()
        generate, marked_for_feed, increment_version, stats_parsed_org_files, stats_parsed_org_lines = lazyblorg.determine_changes()
        time_after_parsing = time()
        logging.info(
            "Parsed " +
            str(stats_parsed_org_files) +
            " Org-mode files with " +
            str(stats_parsed_org_lines) +
            " lines (in %.2f seconds)" %
            (time_after_parsing -
             time_before_parsing))

        statistics_list = lazyblorg.generate_output(generate, marked_for_feed, increment_version)
        # following lines seem inefficient but it allows me to add statistics in htmlizer without referencing here:
        stats_generated_total = statistics_list[0]
        stats_generated_temporal = statistics_list[1]
        stats_generated_persistent = statistics_list[2]
        stats_generated_tags = statistics_list[3]
        stats_images_resized = statistics_list[4]
        stats_external_org_to_html5_conversion = statistics_list[5]
        stats_external_latex_to_html5_conversion = statistics_list[6]

        time_after_htmlizing = time()

        logging.info(
            "Generated " +
            str(stats_generated_total) +
            " articles: " +
            str(stats_generated_persistent) +
            " persistent, " +
            str(stats_generated_temporal) +
            " temporal, " +
            str(stats_generated_tags) +
            " tag-pages" +
            ", the entry page, and scaled " +
            str(stats_images_resized) +
            " images (in %.2f seconds)" %
            (time_after_htmlizing -
             time_after_parsing))
        logging.debug("Org mode snippets converted externally: " + str(stats_external_org_to_html5_conversion))
        logging.debug("LaTeX snippets converted externally: " + str(stats_external_latex_to_html5_conversion))

        logging.debug("-------------> cleaning up the stage …")

        logging.debug("successfully finished.")

    except KeyboardInterrupt:

        logging.info("Received KeyboardInterrupt")
# END OF FILE ###########################################################
# Local Variables:
# DISABLEDmode: flyspell
# DISABLEDeval: (ispell-change-dictionary "en_US")
# End: