Coverage for gengive/gengive.py: 78.56%

1# -*- coding: utf-8 -*-

2# pylint: disable=expression-not-assigned,line-too-long

3"""Render text (Danish: gengive tekst)."""

4import base64

5import datetime as dti

6import hashlib

7import json

8import os

9import pathlib

10import shutil

11import sys

12from typing import Dict, Generator, Iterator, List, Optional, Tuple, Union

14import markdown

15from markdown.extensions import attr_list # types: ignore # noqa: F401

16from markdown.extensions import codehilite # types: ignore # noqa: F401

17from markdown.extensions import fenced_code # types: ignore # noqa: F401

18from markdown.extensions import meta # types: ignore # noqa: F401

19from markdown.extensions import tables # types: ignore # noqa: F401

20from markdown.extensions import toc # types: ignore # noqa: F401

# Binder files are named <BINDER_PREFIX><variant><BINDER_POSTFIX>.
BINDER_PREFIX = 'bind-'
BINDER_POSTFIX = '.txt'
# Chunk size in bytes used when feeding file content into the hash.
BUFFER_BYTES = 1 << 16
CONFIG_NAME = 'render-config.json'
DEBUG_VAR = 'GENGIVE_DEBUG'
DEBUG = os.getenv(DEBUG_VAR)
DEFAULT_CONFIG_NAME = '.gengive.json'
DEFAULT_TARGET = 'default'
ENCODING = 'utf-8'
ENCODING_ERRORS_POLICY = 'ignore'
# Folder names starting with this prefix are skipped during manuscript discovery.
HIDDEN = '.'

# Root folders default to the current working directory when the environment does not provide them.
PUBLISHER_ROOT_STR = os.getenv('GENGIVE_PUBLISHER_ROOT', '')
PUBLISHER_ROOT = pathlib.Path.cwd() if not PUBLISHER_ROOT_STR else pathlib.Path(PUBLISHER_ROOT_STR)
RENDER_ROOT_STR = os.getenv('GENGIVE_RENDER_ROOT', '')
RENDER_ROOT = pathlib.Path.cwd() if not RENDER_ROOT_STR else pathlib.Path(RENDER_ROOT_STR)

# List of media folder names (split on comma).
MEDIA_FOLDER_NAMES = os.getenv('GENGIVE_MEDIA_FOLDER_NAMES', 'diagrams,images,pictures').split(',')
# NOTE: kept as the raw comma separated string (not split).
NON_MANUSCRIPT_FOLDERS = os.getenv('GENGIVE_NON_MANUSCRIPT_FOLDERS', 'bin,render')
REPORT_TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S UTC'

PathType = pathlib.Path
ModelType = Dict[
    str,
    Union[
        float,
        int,
        str,
        PathType,
        List[str],
        List[PathType],
        Dict[str, str],
        List[Dict[str, Union[int, str]]],
    ],
]

STDIN, STDOUT = 'STDIN', 'STDOUT'
DISPATCH = {STDIN: sys.stdin, STDOUT: sys.stdout}

def describe_file(file_path: PathType) -> Tuple[str, Union[dti.datetime, None], int]:
    """Describe a file as (SHA-256 hex digest, modification time in UTC, size in bytes).

    When file_path is not a regular file the placeholder triple
    ('cafebabe' * 8, None, 0) is returned instead.
    """
    if not file_path.is_file():
        return 'cafebabe' * 8, None, 0

    stats = file_path.stat()
    modified_at = dti.datetime.fromtimestamp(stats.st_mtime, dti.UTC)

    digest = hashlib.sha256()  # noqa
    with open(file_path, 'rb') as source:
        # Hash in BUFFER_BYTES sized chunks; an empty read marks the end of the file.
        while chunk := source.read(BUFFER_BYTES):
            digest.update(chunk)

    return digest.hexdigest(), modified_at, stats.st_size

def workspace_path() -> PathType:
    """Return the workspace folder, which is the module level PUBLISHER_ROOT."""
    workspace = PUBLISHER_ROOT
    return workspace

def manuscripts_available(workspace: PathType) -> Generator[str, None, None]:
    """Yield the names of manuscript folders directly below workspace in sorted order.

    A folder qualifies when it is a directory, its name does not start with HIDDEN,
    and its name is not one of the NON_MANUSCRIPT_FOLDERS entries.
    """
    # NON_MANUSCRIPT_FOLDERS is a comma separated string; testing membership against the raw
    # string would be a substring match (hiding folders like 'in' or 'end'), so compare
    # against the exact names instead. A pre-split sequence is accepted as well.
    if isinstance(NON_MANUSCRIPT_FOLDERS, str):
        excluded = {name.strip() for name in NON_MANUSCRIPT_FOLDERS.split(',') if name.strip()}
    else:
        excluded = set(NON_MANUSCRIPT_FOLDERS)

    folder_names = sorted(
        thing.name for thing in workspace.iterdir() if thing.is_dir() and not thing.name.startswith(HIDDEN)
    )
    for candidate in folder_names:
        if candidate not in excluded:
            yield candidate

def variants_available(manuscript_path: PathType) -> Generator[str, None, None]:
    """Yield the lower-cased variant (target) names defined for a manuscript in sorted file name order.

    A variant is defined by a file named BINDER_PREFIX + <variant> + BINDER_POSTFIX directly
    inside manuscript_path, where <variant> is non-empty.
    """
    binder_names = sorted(
        thing.name for thing in manuscript_path.iterdir() if thing.is_file() and thing.name.startswith(BINDER_PREFIX)
    )
    for candidate in binder_names:
        if candidate.endswith(BINDER_POSTFIX) and len(candidate) > len(BINDER_PREFIX + BINDER_POSTFIX):
            # Strip only the leading prefix and the trailing postfix; a blanket replace()
            # would also eat occurrences inside the variant name itself.
            yield candidate.removeprefix(BINDER_PREFIX).removesuffix(BINDER_POSTFIX).lower()

def parse_request(root_path: PathType, argv: List[str]) -> Tuple[int, str, PathType, str, str, str, PathType]:
    """Verify the request against the file system.

    The argv list is expected to hold exactly five strings: command, publisher root,
    manuscript, variant, and render root.

    Returns the 7-tuple (error code, message, publisher root, command, manuscript, variant, render root).
    On success the error code is 0 and the message is empty.
    On failure the error code is usable as process return code, the message is non-empty,
    and command, manuscript, and variant are empty strings.
    """
    command, publisher_root_text, manuscript, variant, render_root_text = argv
    publisher_root = pathlib.Path(publisher_root_text)
    render_root = pathlib.Path(render_root_text)

    if command == 'verify':
        print('Note: Dry run - verification mode.')

    # A manuscript given as an existing folder path overrides the publisher root with its parent.
    manuscript_folder = pathlib.Path(manuscript) if manuscript else None
    if manuscript_folder is not None and manuscript_folder.is_dir():
        publisher_root = manuscript_folder.parent
        manuscript = manuscript_folder.name
        print(f'Updating publisher root from {root_path} to {publisher_root} ...')

    print(f'Retrieving manuscript folders below publisher root {publisher_root} ...')
    known_manuscripts = tuple(manuscripts_available(publisher_root))
    for name in sorted(known_manuscripts):
        print(f'- {name}')

    if manuscript not in known_manuscripts:
        reason = f'Document({manuscript}) is not available within publisher root {publisher_root}'
        return 1, reason, publisher_root, '', '', '', render_root

    print(f'Identifying variants defined for document({manuscript}) ...')
    known_variants = tuple(variants_available(publisher_root / manuscript))
    for name in sorted(known_variants):
        print(f'- {name}')

    if variant not in known_variants:
        reason = (
            f'Target({variant}) is not defined for document({manuscript})'
            f' - you may want to add a {manuscript}/bind-{variant}.txt file'
        )
        return 1, reason, publisher_root, '', '', '', render_root

    print(
        f'Requested rendering document({manuscript}) for target({variant})'
        f' below {render_root}/render/{manuscript}/{variant}/ ...'
    )
    return 0, '', publisher_root, command, manuscript, variant, render_root

137

138

def load_config(config_folder: PathType, variant: str) -> Tuple[int, str, Dict[str, str]]:
    """Load the render configuration for the variant of the manuscript.

    The configuration is read from CONFIG_NAME inside config_folder and must be a JSON object
    that maps the variant to a non-empty object with a non-empty, space free "name" member.
    If a "css" member is present it names a file relative to config_folder; its content is
    stored under the "css_declarations" key of the returned mapping.

    Returns (error code, message, config). If error code is 0 the message is empty and the dict
    contains the mappings for rendition. Else the error code can be used as process return code,
    the message is non-empty, and the dict is empty.
    """
    config_path = config_folder / CONFIG_NAME
    if not config_path.is_file() or not config_path.stat().st_size:
        return 1, f'Configuration at {config_path} is no file or empty', {}

    # Report unparsable content through the error code contract instead of a traceback.
    try:
        with open(config_path, 'rt', encoding=ENCODING) as handle:
            full_config = json.load(handle)
    except (json.JSONDecodeError, UnicodeDecodeError) as err:
        return 1, f'Configuration at {config_path} is not valid JSON ({err})', {}

    # A top level value that is not an object cannot provide the variant key.
    if not isinstance(full_config, dict) or not full_config.get(variant):
        return 1, f'Configuration at {config_path} lacks the variant key {variant} or the value is empty', {}

    config = full_config[variant]
    # A variant value that is not an object cannot provide the name entry.
    if not isinstance(config, dict) or not config.get('name'):
        return 1, f'Configuration at {config_path} for variant key {variant} lacks name (non-empty) entry', {}

    if ' ' in config['name']:
        return 1, f'The value of the {variant} "name" member SHALL NOT contain spaces in {config_path}', {}

    if config.get('css'):
        css_path = config_folder / config['css']
        if not css_path.is_file():
            return 1, f'The value of the {variant} "css" member if present SHALL be a file path {css_path}', {}
        with open(css_path, 'rt', encoding=ENCODING) as handle:
            config['css_declarations'] = handle.read()

    return 0, '', config

169

170

def load_binder(binder_folder: PathType, variant: str) -> Tuple[int, str, List[PathType]]:
    """Read the binder file of the variant and resolve its entries against binder_folder.

    Every non-blank line of the binder file names one part relative to binder_folder.

    Returns (error code, message, paths). On success the error code is 0, the message is empty,
    and the list holds the part paths in binder order. On failure the error code can be used as
    process return code, the message is non-empty, and the list is empty.
    """
    binder_path = binder_folder / f'bind-{variant}.txt'
    if not (binder_path.is_file() and binder_path.stat().st_size):
        return 1, f'Binder at {binder_path} is no file or empty', []

    binder: List[PathType] = []
    with open(binder_path, 'rt', encoding=ENCODING) as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if entry:
                binder.append(binder_folder / entry)

    if not binder:
        return 1, f'Binder has no entries at {binder_path}', []

    failing = [path.name for path in binder if not path.is_file()]
    if failing:
        return 1, f'Failed to find files for binder entries: ({", ".join(failing)}) in {binder_path}', []

    return 0, '', binder

191

192

def bind_parts(binder: List[PathType], collation_path: PathType) -> List[str]:
    """Concatenate the files listed in binder and write the result to collation_path.

    Line ends are removed from every line and each non-empty part is followed by one empty line.
    Returns the in memory document as list of lines.
    """
    collated: List[str] = []
    for part_path in binder:
        with open(part_path, 'rt', encoding=ENCODING) as source:
            lines = [raw.rstrip('\n').replace('\r', '') for raw in source]
        if lines:
            collated += lines
            collated.append('')  # TODO(sthagen) use re-format later

    with open(collation_path, 'wt', encoding=ENCODING) as sink:
        sink.write('\n'.join(collated))

    return collated

208

209

def render_html(collation_path: PathType, collation_name: str, html_path: PathType, css: str) -> None:
    """Convert the markdown at collation_path to HTML and write a complete page to html_path.

    The collation_name is placed in the description meta tag and the title of the page,
    the css text is embedded in a style element of the head.
    """
    enabled_extensions = ['attr_list', 'codehilite', 'fenced_code', 'tables', 'toc']
    converter = markdown.Markdown(extensions=enabled_extensions, output_format='html')
    with open(collation_path, 'rt', encoding=ENCODING) as source:
        body = converter.convert(source.read())

    # NOTE(review): the leading whitespace of the template lines could not be recovered from the
    # coverage listing this block was taken from - confirm against the real source before merging.
    head = f"""<!DOCTYPE html>
 <html lang="en">
 <head>
 <meta charset="utf-8">
 <meta name="description" content="Some Documents '{collation_name}'.">
 <meta name="viewport" content="width=device-width, initial-scale=1">
 <style>
 {css}
 </style>
 <title>{collation_name}</title>
 </head>
 <body>
 """
    tail = """
 </body>
 </html>
 """
    with open(html_path, 'wt', encoding=ENCODING) as writer:
        for chunk in (head, body, tail):
            writer.write(chunk)

239

240

def extract_media_selection(in_mem_md_doc: List[str], manuscript_path: PathType) -> List[str]:  # type: ignore
    """Extract the media selection from the intersection of file tree and document analysis.

    Only document lines mentioning one of the MEDIA_FOLDER_NAMES (as '<name>/') are considered.
    A file below one of these folders is selected when its path relative to manuscript_path
    (with forward slashes) occurs in one of those lines.

    Returns the selected relative asset paths as strings.
    """
    lines_with_asset_mentions = '\n'.join(
        line for line in in_mem_md_doc if any(f'{name}/' in line for name in MEDIA_FOLDER_NAMES)
    )

    media_selection = []
    for asset_folder_name in MEDIA_FOLDER_NAMES:
        for asset_path in sorted((manuscript_path / asset_folder_name).glob('**/*')):
            # The recursive glob also yields directories; those are no assets and cannot be copied.
            if not asset_path.is_file():
                continue
            # Keep sub folders in the asset path; using only the file name would flatten nested
            # assets onto paths that do not exist.
            asset_path_str = asset_path.relative_to(manuscript_path).as_posix()
            if asset_path_str in lines_with_asset_mentions:
                media_selection.append(asset_path_str)

    return media_selection

257

258

def copy_media_assets(manuscript_path: PathType, media_selection: List[str], html_folder: PathType) -> None:
    """Copy select media assets from the MEDIA_FOLDER_NAMES folders to the render tree.

    A file is copied when its path relative to manuscript_path (with forward slashes) is an
    entry of media_selection; the same relative path is used below html_folder.
    """
    selected = set(media_selection)
    for asset_folder_name in MEDIA_FOLDER_NAMES:
        for asset_path in sorted((manuscript_path / asset_folder_name).glob('**/*')):
            # Directories yielded by the recursive glob are skipped; copy2 only handles files.
            if not asset_path.is_file():
                continue
            asset_path_str = asset_path.relative_to(manuscript_path).as_posix()
            if asset_path_str not in selected:
                continue
            dest = html_folder / asset_path_str
            # Create the full parent chain so assets in sub folders find their target folder.
            dest.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(asset_path, dest)

269

270

def collect_asset_descriptions(media_selection, manuscript_path: PathType):  # type: ignore
    """Describe every selected media asset for the rendering report.

    A file below one of the MEDIA_FOLDER_NAMES folders is described when its path relative to
    manuscript_path (with forward slashes) is an entry of media_selection.

    Returns a list of dicts with the keys asset_path_str, asset_hash_sha256, asset_data_version,
    and asset_size_bytes. If describe_file reports no modification time for an asset an error
    line is printed and the integer 1 is returned instead of the list.
    """
    selected = set(media_selection)
    asset_descriptions = []
    for asset_folder_name in MEDIA_FOLDER_NAMES:
        for asset_path in sorted((manuscript_path / asset_folder_name).glob('**/*')):
            # Directories yielded by the recursive glob are no assets and would only trigger the error return.
            if not asset_path.is_file():
                continue
            # Keep sub folders in the asset path instead of flattening to the file name.
            asset_path_str = asset_path.relative_to(manuscript_path).as_posix()
            if asset_path_str not in selected:
                continue
            a_hash, mod_at, size_bytes = describe_file(asset_path)
            if mod_at is None:
                print(f'ERROR: media asset file ({asset_path}) is not accessible for report of request?')
                return 1
            asset_descriptions.append(
                {
                    'asset_path_str': asset_path_str,
                    'asset_hash_sha256': a_hash,
                    'asset_data_version': mod_at.strftime(REPORT_TIMESTAMP_FORMAT),
                    'asset_size_bytes': size_bytes,
                }
            )

    return asset_descriptions

297

298

def document_rendering(model: ModelType, collation_folder: PathType, stdout: bool = False) -> None:
    """Emit the rendering report as JSON (indent 2).

    The report is a copy of the model in which path values, and non-empty lists whose first
    item is a path, are converted to strings. With stdout set the report is printed,
    otherwise it is written to 'render-info.json' inside collation_folder.
    """

    def as_reportable(value):  # type: ignore
        """Map path values (and lists led by a path) to strings, pass anything else through."""
        if isinstance(value, list) and value:
            if isinstance(value[0], PathType):
                return [str(item) for item in value]
            return value
        if isinstance(value, pathlib.Path):
            return str(value)
        return value

    report = {key: as_reportable(value) for key, value in model.items()}

    if stdout:
        print(json.dumps(report, indent=2))
        return

    with open(collation_folder / 'render-info.json', 'wt', encoding=ENCODING) as handle:
        json.dump(report, handle, indent=2)

319

320

def reader(path: str) -> Iterator[str]:
    """Generate the lines of the text file at path (decoded with ENCODING), line ends included."""
    with open(pathlib.Path(path), 'rt', encoding=ENCODING) as handle:
        yield from handle

326

327

def verify_request(argv: Optional[List[str]]) -> Tuple[int, str, List[str]]:
    """Fail with grace.

    The argv list must hold exactly three items: the command ('render' or 'verify') and two
    command specific values.

    - verify: source file (may be empty) and configuration file (required, existing, .json)
    - render: manuscript folder (may be empty) and target (required)

    Returns (error code, message, argv). On success the error code is 0, the message is empty,
    and argv is passed through. On failure the error code is 1 or 2, the message is non-empty,
    and the list is [''].
    """
    if not argv or len(argv) != 3:
        return 2, 'received wrong number of arguments', ['']

    command, wun, two = argv
    # The comma between the names matters: adjacent string literals would be
    # concatenated into the single entry 'renderverify' and reject every command.
    if command not in ('render', 'verify'):
        return 2, 'received unknown command', ['']

    if command == 'verify':
        inp, config = wun, two

        if inp:
            if not pathlib.Path(str(inp)).is_file():
                return 1, 'source is no file', ['']

        if not config:
            return 2, 'configuration missing', ['']

        config_path = pathlib.Path(str(config))
        if not config_path.is_file():
            return 1, f'config ({config_path}) is no file', ['']
        if not ''.join(config_path.suffixes).lower().endswith('.json'):
            return 1, 'config has no .json extension', ['']

        return 0, '', argv

    manuscript, target = wun, two

    if manuscript:
        if not pathlib.Path(str(manuscript)).is_dir():
            return 1, 'manuscript is no folder', ['']

    if not target:
        return 2, 'target missing', ['']

    return 0, '', argv

365

366

def main(argv: Union[List[str], None] = None) -> int:
    """Drive the request, discover, rendering, and reporting processes.

    The argv list (default: sys.argv[1:]) must hold exactly five strings: command, publisher
    root, manuscript, variant, and render root. With the command 'verify' the collected report
    is printed and nothing is bound or rendered.

    Returns 0 on success, 2 on wrong usage, and otherwise the error code of the failing step.
    """
    argv = sys.argv[1:] if argv is None else argv

    if not argv or not isinstance(argv, list) or len(argv) != 5:
        print('For usage info: render --help')
        return 2

    processing_start = dti.datetime.now(dti.UTC)
    root_path = workspace_path()
    error_code, message, root_path, command, manuscript, variant, render_path = parse_request(root_path, argv)
    if error_code:
        print(f'ERROR: {message}')
        return error_code
    model: ModelType = {
        'request_parameters': argv,
        'processing_start': processing_start.strftime(REPORT_TIMESTAMP_FORMAT),
        'manuscript': manuscript,
        'variant': variant,
    }
    manuscript_path = root_path / manuscript
    error_code, message, render_config = load_config(manuscript_path, variant)
    if error_code:
        print(f'ERROR: {message}')
        return error_code
    # Keep the plain CSS for rendering; the report carries the base64 encoded form.
    css = render_config.get('css_declarations', '')
    if css:
        render_config['css_declarations'] = base64.b64encode(css.encode(ENCODING)).decode(ENCODING)
    a_path = manuscript_path / CONFIG_NAME
    a_hash, mod_at, size_bytes = describe_file(a_path)
    if mod_at is None:
        print(f'ERROR: configuration file ({a_path}) is not accessible for report of request?')
        return 1

    model = {
        **model,
        'manuscript_path': manuscript_path,
        'config_path': a_path,
        'config_hash_sha256': a_hash,
        'config_data_version': mod_at.strftime(REPORT_TIMESTAMP_FORMAT),
        'config_size_bytes': size_bytes,
        'render_config': render_config,
    }

    error_code, message, binder = load_binder(manuscript_path, variant)
    if error_code:
        print(f'ERROR: {message}')
        return error_code
    a_path = manuscript_path / f'bind-{variant}.txt'
    a_hash, mod_at, size_bytes = describe_file(a_path)
    if mod_at is None:
        print(f'ERROR: binder definition file ({a_path}) is not accessible for report of request?')
        return 1

    model = {
        **model,
        'binder_path': a_path,
        'binder_hash_sha256': a_hash,
        'binder_data_version': mod_at.strftime(REPORT_TIMESTAMP_FORMAT),
        'binder_size_bytes': size_bytes,
        'binder': binder,
    }

    print('Binder analysis OK, all files resolve. Sequence of binding will be:')
    for rank, part in enumerate(binder, start=1):
        print(f'{rank :>2d}: {part}')

    collation_folder = render_path / 'render' / manuscript / variant
    collation_folder.mkdir(parents=True, exist_ok=True)
    collation_name = f'{render_config["name"]}.md'
    collation_path = collation_folder / collation_name
    if command == 'verify':
        # Dry run: report what has been collected so far and stop before binding.
        document_rendering(model, collation_folder, stdout=True)
        return 0

    print(f'Binding source documents from ({manuscript}) for target({variant}) to {collation_path} ...')
    in_mem_md_doc = bind_parts(binder, collation_path)
    lines_written = len(in_mem_md_doc)
    print(f'- Written {lines_written} lines from {len(binder)} parts to {collation_path}')
    a_hash, mod_at, size_bytes = describe_file(collation_path)
    if mod_at is None:
        print(f'ERROR: collation markdown file ({collation_path}) is not accessible for report of request?')
        return 1

    model = {
        **model,
        'collation_folder': collation_folder,
        'collation_name': collation_name,
        'collation_path': collation_path,
        'collation_hash_sha256': a_hash,
        'collation_data_version': mod_at.strftime(REPORT_TIMESTAMP_FORMAT),
        'collation_size_bytes': size_bytes,
        'lines_written': lines_written,
    }

    html_folder = collation_folder / 'html'
    html_folder.mkdir(parents=True, exist_ok=True)
    html_name = f'{render_config["name"]}.html'
    html_path = html_folder / html_name
    print(f'Writing HTML rendition from ({manuscript}) for target({variant}) to {html_path} ...')

    print(f'Creating HTML rendition of document({manuscript}) for target({variant}) below {html_folder}/ ...')
    render_html(collation_path, collation_name, html_path, css=css)
    a_hash, mod_at, size_bytes = describe_file(html_path)
    if mod_at is None:
        print(f'ERROR: rendered HTML file ({html_path}) is not accessible for report of request?')
        return 1

    model = {
        **model,
        'html_folder': html_folder,
        'html_name': html_name,
        'html_path': html_path,
        'html_hash_sha256': a_hash,
        'html_data_version': mod_at.strftime(REPORT_TIMESTAMP_FORMAT),
        'html_size_bytes': size_bytes,
    }

    print('Determine set of media assets in use ...')
    media_selection = extract_media_selection(in_mem_md_doc, manuscript_path)

    print(f'Copying the per conventions {len(MEDIA_FOLDER_NAMES)} media asset folders from source to target ...')
    copy_media_assets(manuscript_path, media_selection, html_folder)

    asset_descriptions = collect_asset_descriptions(media_selection, manuscript_path)
    if not isinstance(asset_descriptions, list):
        # collect_asset_descriptions signals failure with an integer (after printing the error);
        # do not store that sentinel in the report and do not claim success.
        return 1
    model['asset_descriptions'] = asset_descriptions

    print(f'Done. Entrypoint is {html_path}')
    processing_stop = dti.datetime.now(dti.UTC)
    model = {
        **model,
        'processing_stop': processing_stop.strftime(REPORT_TIMESTAMP_FORMAT),
        'processing_duration_seconds': (processing_stop - processing_start).total_seconds(),
        'render_config': render_config,
    }
    document_rendering(model, collation_folder)
    return 0