Coverage for gengive/gengive.py: 78.56%
302 statements
« prev ^ index » next coverage.py v7.4.1, created at 2024-02-04 18:32:40 +00:00
« prev ^ index » next coverage.py v7.4.1, created at 2024-02-04 18:32:40 +00:00
1# -*- coding: utf-8 -*-
2# pylint: disable=expression-not-assigned,line-too-long
3"""Render text (Danish: gengive tekst)."""
4import base64
5import datetime as dti
6import hashlib
7import json
8import os
9import pathlib
10import shutil
11import sys
12from typing import Dict, Generator, Iterator, List, Optional, Tuple, Union
14import markdown
15from markdown.extensions import attr_list # types: ignore # noqa: F401
16from markdown.extensions import codehilite # types: ignore # noqa: F401
17from markdown.extensions import fenced_code # types: ignore # noqa: F401
18from markdown.extensions import meta # types: ignore # noqa: F401
19from markdown.extensions import tables # types: ignore # noqa: F401
20from markdown.extensions import toc # types: ignore # noqa: F401
# Naming convention for binder files: <BINDER_PREFIX><variant><BINDER_POSTFIX>
BINDER_PREFIX = 'bind-'
BINDER_POSTFIX = '.txt'

# Chunk size in bytes used when hashing file content (65536).
BUFFER_BYTES = 1 << 16

CONFIG_NAME = 'render-config.json'
DEFAULT_CONFIG_NAME = '.gengive.json'
DEFAULT_TARGET = 'default'

DEBUG_VAR = 'GENGIVE_DEBUG'
DEBUG = os.environ.get(DEBUG_VAR)

ENCODING = 'utf-8'
ENCODING_ERRORS_POLICY = 'ignore'

# Folder names starting with this marker are skipped during manuscript discovery.
HIDDEN = '.'

# Roots default to the current working directory when the environment does not override them.
PUBLISHER_ROOT_STR = os.environ.get('GENGIVE_PUBLISHER_ROOT', '')
PUBLISHER_ROOT = pathlib.Path(PUBLISHER_ROOT_STR or pathlib.Path.cwd())
RENDER_ROOT_STR = os.environ.get('GENGIVE_RENDER_ROOT', '')
RENDER_ROOT = pathlib.Path(RENDER_ROOT_STR or pathlib.Path.cwd())

# List of folder names (split from a comma separated value).
MEDIA_FOLDER_NAMES = os.environ.get('GENGIVE_MEDIA_FOLDER_NAMES', 'diagrams,images,pictures').split(',')
# NOTE: kept as the raw comma separated string (not split into a list).
NON_MANUSCRIPT_FOLDERS = os.environ.get('GENGIVE_NON_MANUSCRIPT_FOLDERS', 'bin,render')

REPORT_TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S UTC'

PathType = pathlib.Path
ModelType = Dict[
    str,
    Union[
        float,
        int,
        str,
        PathType,
        List[str],
        List[PathType],
        Dict[str, str],
        List[Dict[str, Union[int, str]]],
    ],
]

STDIN = 'STDIN'
STDOUT = 'STDOUT'
DISPATCH = {STDIN: sys.stdin, STDOUT: sys.stdout}
def describe_file(file_path: PathType) -> Tuple[str, Union[dti.datetime, None], int]:
    """Describe a file by SHA-256 hex digest, modification time (UTC), and size in bytes.

    When the path is not a regular file a placeholder digest, None, and 0 are returned.
    """
    if not file_path.is_file():
        return 'cafebabe' * 8, None, 0

    stats = file_path.stat()
    modified_at = dti.datetime.fromtimestamp(stats.st_mtime, dti.UTC)

    digest = hashlib.sha256()  # noqa
    with open(file_path, 'rb') as source:
        # Hash in chunks so the memory use stays bounded for large files.
        while chunk := source.read(BUFFER_BYTES):
            digest.update(chunk)

    return digest.hexdigest(), modified_at, stats.st_size
def workspace_path() -> PathType:
    """Return the workspace folder, i.e. the module level PUBLISHER_ROOT."""
    workspace: PathType = PUBLISHER_ROOT
    return workspace
def manuscripts_available(workspace: PathType) -> Generator[str, None, None]:
    """Yield the sorted names of manuscript folders directly below workspace.

    Hidden folders (name starting with HIDDEN) and folders listed in
    NON_MANUSCRIPT_FOLDERS are skipped.
    """
    # NON_MANUSCRIPT_FOLDERS is a comma separated string; testing membership against
    # the raw string would be a substring test and wrongly exclude names like 'in' or 'end'.
    if isinstance(NON_MANUSCRIPT_FOLDERS, str):
        excluded = {name.strip() for name in NON_MANUSCRIPT_FOLDERS.split(',')}
    else:
        excluded = set(NON_MANUSCRIPT_FOLDERS)

    folder_names = sorted(
        entry.name for entry in workspace.iterdir() if entry.is_dir() and not entry.name.startswith(HIDDEN)
    )
    for name in folder_names:
        if name not in excluded:
            yield name
def variants_available(manuscript_path: PathType) -> Generator[str, None, None]:
    """Yield the lower cased variant (target) names defined by binder files in manuscript_path.

    A binder file is a regular file named BINDER_PREFIX + <variant> + BINDER_POSTFIX with a
    non-empty variant part. Results follow the sorted order of the binder file names.
    """
    minimal_length = len(BINDER_PREFIX + BINDER_POSTFIX)
    binder_names = sorted(
        entry.name for entry in manuscript_path.iterdir() if entry.is_file() and entry.name.startswith(BINDER_PREFIX)
    )
    for name in binder_names:
        if name.endswith(BINDER_POSTFIX) and len(name) > minimal_length:
            # Strip only the leading prefix and the trailing postfix; replacing every
            # occurrence would mangle variants containing those fragments.
            yield name.removeprefix(BINDER_PREFIX).removesuffix(BINDER_POSTFIX).lower()
def parse_request(root_path: PathType, argv: List[str]) -> Tuple[int, str, PathType, str, str, str, PathType]:
    """Verify the request and return a 7-tuple describing the outcome.

    The tuple holds error code, message, publisher root, command, manuscript, variant, and render root.
    The argv list shall contain exactly five items: command, publisher root, manuscript, variant,
    and render root.
    If error code is 0 the message is empty and manuscript as well as variant are valid;
    else the error code can be used as process return code, the message is non-empty,
    and command, manuscript, and variant are returned as empty strings.
    """
    command, publisher_root_text, manuscript, variant, render_root_text = argv
    publisher_root = pathlib.Path(publisher_root_text)
    render_root = pathlib.Path(render_root_text)

    def failure(reason):
        """Build the error result using the roots as known at the time of the call."""
        return 1, reason, publisher_root, '', '', '', render_root

    if command == 'verify':
        print('Note: Dry run - verification mode.')

    # A manuscript given as existing folder path overrides the publisher root.
    if manuscript and pathlib.Path(manuscript).is_dir():
        manuscript_folder = pathlib.Path(manuscript)
        publisher_root, manuscript = manuscript_folder.parent, manuscript_folder.name
        print(f'Updating publisher root from {root_path} to {publisher_root} ...')

    print(f'Retrieving manuscript folders below publisher root {publisher_root} ...')
    manuscripts = tuple(manuscripts_available(publisher_root))
    for name in sorted(manuscripts):
        print(f'- {name}')
    if manuscript not in manuscripts:
        return failure(f'Document({manuscript}) is not available within publisher root {publisher_root}')

    print(f'Identifying variants defined for document({manuscript}) ...')
    variants = tuple(variants_available(publisher_root / manuscript))
    for name in sorted(variants):
        print(f'- {name}')
    if variant not in variants:
        return failure(
            f'Target({variant}) is not defined for document({manuscript})'
            f' - you may want to add a {manuscript}/bind-{variant}.txt file'
        )

    print(
        f'Requested rendering document({manuscript}) for target({variant})'
        f' below {render_root}/render/{manuscript}/{variant}/ ...'
    )
    return 0, '', publisher_root, command, manuscript, variant, render_root
def load_config(config_folder: PathType, variant: str) -> Tuple[int, str, Dict[str, str]]:
    """Load the render configuration of the variant from CONFIG_NAME inside config_folder.

    Returns error code, message, and configuration mapping.
    If error code is 0 the message is empty and the mapping holds the entries for rendition
    (plus 'css_declarations' with the content of the css file when a 'css' entry is present).
    Else the error code can be used as process return code, the message is non-empty,
    and the mapping is empty.
    """
    config_path = config_folder / CONFIG_NAME

    def rejected(reason):
        """Build the error result."""
        return 1, reason, {}

    if not (config_path.is_file() and config_path.stat().st_size):
        return rejected(f'Configuration at {config_path} is no file or empty')

    with open(config_path, 'rt', encoding=ENCODING) as source:
        per_variant = json.load(source)

    config = per_variant.get(variant)
    if not config:
        return rejected(f'Configuration at {config_path} lacks the variant key {variant} or the value is empty')

    name = config.get('name')
    if not name:
        return rejected(f'Configuration at {config_path} for variant key {variant} lacks name (non-empty) entry')
    if ' ' in name:
        return rejected(f'The value of the {variant} "name" member SHALL NOT contain spaces in {config_path}')

    css_entry = config.get('css')
    if not css_entry:
        return 0, '', config

    css_path = config_folder / css_entry
    if not css_path.is_file():
        return rejected(f'The value of the {variant} "css" member if present SHALL be a file path {css_path}')

    config['css_declarations'] = css_path.read_text(encoding=ENCODING)
    return 0, '', config
def load_binder(binder_folder: PathType, variant: str) -> Tuple[int, str, List[PathType]]:
    """Load the binder of the variant, i.e. the ordered paths of the parts to bind.

    Every non-blank line of the binder file names a part relative to binder_folder.
    Returns error code, message, and list of part paths.
    If error code is 0 the message is empty and all listed parts are existing files.
    Else the error code can be used as process return code, the message is non-empty,
    and the list is empty.
    """
    binder_path = binder_folder / f'bind-{variant}.txt'
    if not (binder_path.is_file() and binder_path.stat().st_size):
        return 1, f'Binder at {binder_path} is no file or empty', []

    with binder_path.open('rt', encoding=ENCODING) as source:
        entries = (line.strip() for line in source)
        binder = [binder_folder / entry for entry in entries if entry]

    if not binder:
        return 1, f'Binder has no entries at {binder_path}', []

    missing = [path.name for path in binder if not path.is_file()]
    if missing:
        return 1, f'Failed to find files for binder entries: ({", ".join(missing)}) in {binder_path}', []

    return 0, '', binder
def bind_parts(binder: List[PathType], collation_path: PathType) -> List[str]:
    """Concatenate the files listed in binder and write the result to collation_path.

    Every non-empty part is followed by one empty line.
    Returns the in memory document as list of lines (without line endings).
    """
    document: List[str] = []
    for part_path in binder:
        with open(part_path, 'rt', encoding=ENCODING) as source:
            lines = [raw.rstrip('\n').replace('\r', '') for raw in source]
            if lines:
                document += lines
                document.append('')  # TODO(sthagen) use re-format later

    collation_path.write_text('\n'.join(document), encoding=ENCODING)
    return document
def render_html(collation_path: PathType, collation_name: str, html_path: PathType, css: str) -> None:
    """Render the HTML from the markdown.

    Reads the markdown document at collation_path, converts it with the python-markdown
    extensions listed below, and writes a complete HTML page to html_path.
    The collation_name is used for the description meta tag and the title;
    css is embedded verbatim inside a style element.
    """
    # NOTE(review): the 'meta' extension is imported at module level but not enabled here.
    extensions = ['attr_list', 'codehilite', 'fenced_code', 'tables', 'toc']

    md_processor = markdown.Markdown(extensions=extensions, output_format='html')
    with open(collation_path, 'rt', encoding=ENCODING) as handle:
        html_body_content = md_processor.convert(handle.read())

    # NOTE(review): collation_name and css are interpolated without HTML escaping.
    # The whitespace inside both template literals is part of the emitted page.
    prefix = f"""<!DOCTYPE html>
 <html lang="en">
 <head>
 <meta charset="utf-8">
 <meta name="description" content="Some Documents '{collation_name}'.">
 <meta name="viewport" content="width=device-width, initial-scale=1">
 <style>
 {css}
 </style>
 <title>{collation_name}</title>
 </head>
 <body>
 """
    postfix = """
 </body>
 </html>
 """
    # Page = static head (prefix) + converted markdown body + closing tags (postfix).
    with open(html_path, 'wt', encoding=ENCODING) as writer:
        writer.write(prefix)
        writer.write(html_body_content)
        writer.write(postfix)
def extract_media_selection(in_mem_md_doc, manuscript_path: PathType):  # type: ignore
    """Extract the media selection from the intersection of file tree and document analysis.

    in_mem_md_doc is an iterable of document lines. Every regular file found (recursively)
    below one of the MEDIA_FOLDER_NAMES folders of manuscript_path is selected when its
    path relative to manuscript_path (with forward slashes) occurs in a line mentioning
    a media folder. Returns the list of selected relative paths without duplicates.
    """
    mentions = '\n'.join(
        line for line in in_mem_md_doc if any(f'{name}/' in line for name in MEDIA_FOLDER_NAMES)
    )

    root = pathlib.Path(manuscript_path)
    media_selection = []
    for asset_folder_name in MEDIA_FOLDER_NAMES:
        for asset_path in sorted((root / asset_folder_name).glob('**/*')):
            # The recursive glob also yields folders; only files can be copied or hashed.
            if not asset_path.is_file():
                continue
            # Keep the sub folder part so that nested assets map to the path used in the document.
            asset_path_str = asset_path.relative_to(root).as_posix()
            if asset_path_str in mentions and asset_path_str not in media_selection:
                media_selection.append(asset_path_str)

    return media_selection
def copy_media_assets(manuscript_path: PathType, media_selection: List[str], html_folder: PathType) -> None:
    """Copy select media assets from the MEDIA_FOLDER_NAMES folders to the render tree.

    Only regular files whose path relative to manuscript_path (with forward slashes) is a
    member of media_selection are copied; the relative path is recreated below html_folder.
    """
    root = pathlib.Path(manuscript_path)
    selected = set(media_selection)
    for asset_folder_name in MEDIA_FOLDER_NAMES:
        for src in sorted((root / asset_folder_name).glob('**/*')):
            # The recursive glob also yields folders which shutil.copy2 cannot copy.
            if not src.is_file():
                continue
            asset_path_str = src.relative_to(root).as_posix()
            if asset_path_str not in selected:
                continue
            dest = html_folder / asset_path_str
            # Nested assets need their sub folders below the target media folder.
            dest.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(src, dest)
def collect_asset_descriptions(media_selection, manuscript_path: PathType):  # type: ignore
    """Describe the selected media assets for the rendering report.

    For every regular file below the MEDIA_FOLDER_NAMES folders of manuscript_path whose
    relative path (with forward slashes) is a member of media_selection, a dict with the keys
    'asset_path_str', 'asset_hash_sha256', 'asset_data_version', and 'asset_size_bytes' is collected.

    Returns the list of these dicts. If a selected file cannot be described, an error is
    printed and the integer 1 is returned instead (kept for backward compatibility).
    """
    root = pathlib.Path(manuscript_path)
    selected = set(media_selection)
    asset_descriptions = []
    for asset_folder_name in MEDIA_FOLDER_NAMES:
        for src_path in sorted((root / asset_folder_name).glob('**/*')):
            # The recursive glob also yields folders; these are no assets to describe.
            if not src_path.is_file():
                continue
            asset_path_str = src_path.relative_to(root).as_posix()
            if asset_path_str not in selected:
                continue
            a_hash, mod_at, size_bytes = describe_file(src_path)
            if mod_at is None:
                print(f'ERROR: media asset file ({src_path}) is not accessible for report of request?')
                return 1
            asset_descriptions.append(
                {
                    'asset_path_str': asset_path_str,
                    'asset_hash_sha256': a_hash,
                    'asset_data_version': mod_at.strftime(REPORT_TIMESTAMP_FORMAT),
                    'asset_size_bytes': size_bytes,
                }
            )

    return asset_descriptions
def document_rendering(model: ModelType, collation_folder: PathType, stdout: bool = False) -> None:
    """Report the rendering as JSON linking the renditions with request time information.

    A shallow copy of the model is made JSON serializable by turning path values and
    lists starting with a path into strings. The report is printed when stdout is true,
    else it is written to 'render-info.json' inside collation_folder.
    """

    def as_text(value):  # type: ignore
        """Map paths (or lists led by a path) to strings and pass on anything else."""
        if isinstance(value, list) and value and isinstance(value[0], PathType):
            return [str(item) for item in value]
        if isinstance(value, pathlib.Path):
            return str(value)
        return value

    report: ModelType = {key: as_text(value) for key, value in model.items()}

    if stdout:
        print(json.dumps(report, indent=2))
        return

    with open(collation_folder / 'render-info.json', 'wt', encoding=ENCODING) as handle:
        json.dump(report, handle, indent=2)
def reader(path: str) -> Iterator[str]:
    """Generate the lines of the text file at path (line endings are kept)."""
    with pathlib.Path(path).open('rt', encoding=ENCODING) as source:
        yield from source
def verify_request(argv: Optional[List[str]]) -> Tuple[int, str, List[str]]:
    """Fail with grace.

    Expects argv as three items: command followed by two command specific arguments.
    - 'verify': source file (may be empty) and JSON configuration file
    - 'render': manuscript folder (may be empty) and target name

    Returns error code, message, and arguments. On success the error code is 0, the message
    is empty, and argv is passed through; else the code is 1 or 2, the message is non-empty,
    and the arguments are [''].
    """
    if not argv or len(argv) != 3:
        return 2, 'received wrong number of arguments', ['']

    command, wun, two = argv
    # Two separate entries - adjacent string literals without comma would be concatenated.
    if command not in ('render', 'verify'):
        return 2, 'received unknown command', ['']

    if command == 'verify':
        inp, config = wun, two

        if inp:
            if not pathlib.Path(str(inp)).is_file():
                return 1, 'source is no file', ['']

        if not config:
            return 2, 'configuration missing', ['']

        config_path = pathlib.Path(str(config))
        if not config_path.is_file():
            return 1, f'config ({config_path}) is no file', ['']
        if not ''.join(config_path.suffixes).lower().endswith('.json'):
            return 1, 'config has no .json extension', ['']

        return 0, '', argv

    manuscript, target = wun, two

    if manuscript:
        if not pathlib.Path(str(manuscript)).is_dir():
            return 1, 'manuscript is no folder', ['']

    if not target:
        return 2, 'target missing', ['']

    return 0, '', argv
def main(argv: Union[List[str], None] = None) -> int:
    """Drive the request, discover, rendering, and reporting processes.

    Expects five arguments (command, publisher root, manuscript, variant, render root);
    falls back to sys.argv[1:] when argv is None.
    Returns 0 on success, 2 on wrong usage, or the error code of the failing step.
    With the command 'verify' the collected model is printed as JSON and nothing is rendered.
    """
    argv = sys.argv[1:] if argv is None else argv

    if not argv or not isinstance(argv, list) or len(argv) != 5:
        print('For usage info: render --help')
        return 2

    processing_start = dti.datetime.now(dti.UTC)
    root_path = workspace_path()
    # The request may replace the root path (manuscript given as folder path).
    error_code, message, root_path, command, manuscript, variant, render_path = parse_request(root_path, argv)
    if error_code:
        print(f'ERROR: {message}')
        return error_code
    # The model accumulates all facts for the report written at the end.
    model: ModelType = {
        'request_parameters': argv,
        'processing_start': processing_start.strftime(REPORT_TIMESTAMP_FORMAT),
        'manuscript': manuscript,
        'variant': variant,
    }
    manuscript_path = root_path / manuscript
    error_code, message, render_config = load_config(manuscript_path, variant)
    if error_code:
        print(f'ERROR: {message}')
        return error_code
    # Keep the plain css for rendering; the report stores the declarations base64 encoded.
    css = render_config.get('css_declarations', '')
    if css:
        render_config['css_declarations'] = base64.b64encode(css.encode(ENCODING)).decode(ENCODING)
    a_path = manuscript_path / CONFIG_NAME
    a_hash, mod_at, size_bytes = describe_file(a_path)
    if mod_at is None:
        print(f'ERROR: configuration file ({a_path}) is not accessible for report of request?')
        return 1

    model = {
        **model,
        'manuscript_path': manuscript_path,
        'config_path': a_path,
        'config_hash_sha256': a_hash,
        'config_data_version': mod_at.strftime(REPORT_TIMESTAMP_FORMAT),
        'config_size_bytes': size_bytes,
        'render_config': render_config,
    }

    error_code, message, binder = load_binder(manuscript_path, variant)
    if error_code:
        print(f'ERROR: {message}')
        return error_code
    a_path = manuscript_path / f'bind-{variant}.txt'
    a_hash, mod_at, size_bytes = describe_file(a_path)
    if mod_at is None:
        print(f'ERROR: binder definition file ({a_path}) is not accessible for report of request?')
        return 1

    model = {
        **model,
        'binder_path': a_path,
        'binder_hash_sha256': a_hash,
        'binder_data_version': mod_at.strftime(REPORT_TIMESTAMP_FORMAT),
        'binder_size_bytes': size_bytes,
        'binder': binder,
    }

    print('Binder analysis OK, all files resolve. Sequence of binding will be:')
    for rank, part in enumerate(binder, start=1):
        print(f'{rank :>2d}: {part}')

    # Renditions go below <render root>/render/<manuscript>/<variant>/
    collation_folder = render_path / 'render' / manuscript / variant
    collation_folder.mkdir(parents=True, exist_ok=True)
    collation_name = f'{render_config["name"]}.md'
    collation_path = collation_folder / collation_name
    # Dry run: report what is known so far on standard output and stop before binding.
    if command == 'verify':
        document_rendering(model, collation_folder, stdout=True)
        return 0

    print(f'Binding source documents from ({manuscript}) for target({variant}) to {collation_path} ...')
    in_mem_md_doc = bind_parts(binder, collation_path)
    lines_written = len(in_mem_md_doc)
    print(f'- Written {lines_written} lines from {len(binder)} parts to {collation_path}')
    a_hash, mod_at, size_bytes = describe_file(collation_path)
    if mod_at is None:
        print(f'ERROR: collation markdown file ({collation_path}) is not accessible for report of request?')
        return 1

    model = {
        **model,
        'collation_folder': collation_folder,
        'collation_name': collation_name,
        'collation_path': collation_path,
        'collation_hash_sha256': a_hash,
        'collation_data_version': mod_at.strftime(REPORT_TIMESTAMP_FORMAT),
        'collation_size_bytes': size_bytes,
        'lines_written': lines_written,
    }

    html_folder = collation_folder / 'html'
    html_folder.mkdir(parents=True, exist_ok=True)
    html_name = f'{render_config["name"]}.html'
    html_path = html_folder / html_name
    print(f'Writing HTML rendition from ({manuscript}) for target({variant}) to {html_path} ...')

    print(f'Creating HTML rendition of document({manuscript}) for target({variant}) below {html_folder}/ ...')
    # Rendering uses the plain (not base64 encoded) css declarations.
    render_html(collation_path, collation_name, html_path, css=css)
    a_hash, mod_at, size_bytes = describe_file(html_path)
    if mod_at is None:
        print(f'ERROR: rendered HTML file ({html_path}) is not accessible for report of request?')
        return 1

    model = {
        **model,
        'html_folder': html_folder,
        'html_name': html_name,
        'html_path': html_path,
        'html_hash_sha256': a_hash,
        'html_data_version': mod_at.strftime(REPORT_TIMESTAMP_FORMAT),
        'html_size_bytes': size_bytes,
    }

    print('Determine set of media assets in use ...')
    media_selection = extract_media_selection(in_mem_md_doc, manuscript_path)

    print(f'Copying the per conventions {len(MEDIA_FOLDER_NAMES)} media asset folders from source to target ...')
    copy_media_assets(manuscript_path, media_selection, html_folder)

    # NOTE(review): collect_asset_descriptions returns 1 instead of a list on failure;
    # that value is stored unchecked here.
    model['asset_descriptions'] = collect_asset_descriptions(media_selection, manuscript_path)

    print(f'Done. Entrypoint is {html_path}')
    processing_stop = dti.datetime.now(dti.UTC)
    model = {
        **model,
        'processing_stop': processing_stop.strftime(REPORT_TIMESTAMP_FORMAT),
        'processing_duration_seconds': (processing_stop - processing_start).total_seconds(),
        'render_config': render_config,
    }
    document_rendering(model, collation_folder)
    return 0