Coverage for kohtaaminen/kohtaaminen.py: 74.52%
168 statements
« prev ^ index » next coverage.py v7.4.1, created at 2024-02-04 19:02:00 +00:00
« prev ^ index » next coverage.py v7.4.1, created at 2024-02-04 19:02:00 +00:00
1# -*- coding: utf-8 -*-
2# pylint: disable=expression-not-assigned,line-too-long
3"""Meeting, rendezvous, confluence (Finnish kohtaaminen) mark up, down, and up again. API."""
4import itertools
5import os
6import pathlib
7import re
8import shutil
9import sys
10import tempfile
11import zipfile
12from typing import List, Optional, Tuple, Union
14import mdformat
15import pypandoc # type: ignore
# Name of the environment variable consulted for the debug setting, and its value (None when unset).
DEBUG_VAR = 'KOHTAAMINEN_DEBUG'
DEBUG = os.environ.get(DEBUG_VAR)

# Text encoding used when reading and writing the markdown files.
ENCODING = 'utf-8'
ENCODING_ERRORS_POLICY = 'ignore'

# Symbolic names for the standard streams and the lookup from name to stream object.
STDIN = 'STDIN'
STDOUT = 'STDOUT'
DISPATCH = {STDIN: sys.stdin, STDOUT: sys.stdout}

# Folder (relative to the working directory) below which the markdown tree is written.
MD_ROOT = pathlib.Path('kohtaaminen-md')
def verify_request(argv: Optional[List[str]]) -> Tuple[int, str, List[str]]:
    """Validate the request vector and fail with grace.

    Args:
        argv: Expected to hold exactly two entries: the command and the input path.

    Returns:
        A triple ``(code, message, strings)``. On success the code is ``0``, the message
        is empty and ``strings`` is ``argv`` itself. On failure the code is ``2`` (wrong
        argument count or unknown command) or ``1`` (input is no file or lacks a ``.zip``
        extension), the message names the problem and ``strings`` is ``['']``.
    """
    if not argv or len(argv) != 2:
        return 2, 'received wrong number of arguments', ['']

    command, inp = argv

    # A one-element tuple is required here: ('translate') is only a parenthesized string,
    # and a membership test against a string accepts any substring ('' or 'trans').
    if command not in ('translate',):
        return 2, 'received unknown command', ['']

    # An empty input value skips the file checks altogether.
    if inp:
        in_path = pathlib.Path(str(inp))
        if not in_path.is_file():
            return 1, f'source ({in_path}) is no file', ['']
        if not ''.join(in_path.suffixes).lower().endswith('.zip'):
            return 1, 'source has not .zip extension', ['']

    return 0, '', argv
def filter_index(data: List[str]) -> str:
    """Reduce the converted index page to a heading plus the page listing.

    Everything up to and including the first line starting with '## Available Pages:'
    is dropped, the remainder is kept until a line containing 'Document generated by '
    shows up, and a '# Index' heading is put in front. Runs of blank or space-only lines
    are squeezed, leading new lines removed, and the text ends in exactly one new line.
    """
    stream = iter(data)

    # Consume the noisy head including the line that announces the page listing.
    for candidate in stream:
        if candidate.startswith('## Available Pages:'):
            break

    # Collect the listing until the generator footer begins.
    kept = ['# Index']
    for candidate in stream:
        if 'Document generated by ' in candidate:
            break
        kept.append(candidate)

    squeezed = re.sub(r'\n\s*\n', '\n\n', '\n'.join(kept))  # squeeze space-only and blank lines
    return squeezed.strip('\n') + '\n'  # no leading new lines, single new line at end of file
def filter_leaf(data: List[str], assets: List[str]) -> str:
    """Clean up the converted markdown lines of a single (non-index) page.

    Args:
        data: Lines of the page as produced by the html to markdown conversion.
        assets: Collector of image sources; every image source found on the page that is
            not yet present is appended in place.

    Returns:
        The cleaned text: author lines removed, title span unwrapped, code fences
        normalized, image spans rewritten as markdown images with attributes, left-over
        span fragments dropped, and everything from the generator footer or the
        attachments section onwards cut off. Blank runs are squeezed and the text ends
        in exactly one new line.
    """
    span_end = '</span>'
    lines = [line for line in data if 'Created by <span class="author"> ' not in line]
    for ndx, line in enumerate(lines):
        if '# <span id="title-text">' in line:
            prefix, title_plus = line.split('# <span id="title-text">', 1)
            title = title_plus.strip()
            # Remove the closing tag as a suffix; str.rstrip('</span>') would treat the
            # argument as a character set and also eat trailing letters of the title.
            if title.endswith(span_end):
                title = title[: -len(span_end)]
            lines[ndx] = f'{prefix}# {title.strip()}'
        elif line.startswith('``` syntaxhighlighter-pre'):
            lines[ndx] = '```'
        elif '"><img src="' in line:
            later_head, image_tail = line.split('"><img src="', 1)
            later = f'{later_head}>\n'
            src, rest = image_tail.split('" ', 1)
            if src not in assets:
                assets.append(src)
            try:
                _, height_plus = rest.split('data-height="', 1)
                height, width_plus = height_plus.split('" data-width="', 1)
                width, _ = width_plus.split('" ', 1)
            except ValueError as err:
                print(' ... note: survived image parsing with crash, using defaults. details:', err)
                height, width = '42', '42'
            center = 'image-center' in line
            span_tail = line.endswith(span_end)
            attributes = f'width:{width}, height:{height}, center:{"true" if center else "false"}'
            # NOTE(review): `{ {attributes}}` formats a one-element set, so the attribute
            # text is emitted quoted inside the braces - confirm this is intended.
            image = f'![]({src}){ {attributes}} '
            lines[ndx] = later + image
            if span_tail:
                lines[ndx] += '\n</span>'

    # Rewritten image entries carry embedded new lines; flatten back to one entry per line.
    resplit = [part for line in lines for part in line.split('\n')]
    lines = [
        line
        for line in resplit
        if not line.startswith('<span') and not line.startswith('class="') and line.strip() != span_end
    ]
    lines = list(itertools.takewhile(lambda x: 'Document generated by ' not in x, lines))
    text = '\n'.join(itertools.takewhile(lambda x: not x.startswith('## Attachments:'), lines))
    text = re.sub(r'\n\s*\n', '\n\n', text)  # squeeze space-only and blank lines
    text = text.lstrip('\n')  # no leading new lines
    text = text.rstrip('\n') + '\n'  # single new line at end of file
    return text
def main(argv: Union[List[str], None] = None) -> int:
    """Drive the translation.

    Verifies the request, unpacks the zip archive into a temporary folder, collects the
    html files found there, converts the page named index.html and every other page to
    GitHub flavored markdown below MD_ROOT, copies the image assets referenced by the
    pages, and finally formats the markdown files.

    Args:
        argv: Request vector holding the command and the path to the zip archive.

    Returns:
        ``0`` on success, the code from the request verification on an invalid request,
        or ``1`` when the input is no zip file or no index.html is found.
    """
    error, message, strings = verify_request(argv)
    if error:
        print(message, file=sys.stderr)
        return error

    _command, inp = strings
    if not zipfile.is_zipfile(inp):
        print('wrong magic number in zipfile')
        return 1

    tasks: List[pathlib.Path] = []
    with zipfile.ZipFile(inp, 'r') as zipper:
        alerts = []
        print(f'analyzing zip file listing of ({inp})')
        for name in zipper.namelist():
            if not name[0].isidentifier() or '..' in name:
                alerts.append(f'suspicious entry ({name}) will be skipped')
        if alerts:
            # Advisory only: the findings are reported and processing continues.
            print(f'found {len(alerts)} suspicious entries in zip file ({inp}):')
            for alert in alerts:
                print(f'- {alert}')

        asset_source_root = ''
        assets: List[str] = []
        with tempfile.TemporaryDirectory() as unpack:
            print(f'unpacking zip file below ({unpack})')
            zipper.extractall(path=unpack)
            print(f'traversing folder ({unpack})')
            for place in sorted(pathlib.Path(unpack).glob('**')):
                print(f'* {place}')
                for thing in sorted(place.iterdir()):
                    if thing.is_dir():
                        # The parent of the first folder named attachments anchors the asset paths.
                        if not asset_source_root and thing.name == 'attachments':
                            asset_source_root = str(thing.parent)
                        continue
                    # Path.suffix is '' for names without extension, whereas indexing
                    # suffixes[-1] raised IndexError for such files.
                    # NOTE(review): nesting of the print below was reconstructed from a
                    # flattened listing - confirm it belongs inside this condition.
                    if thing.suffix == '.html':
                        tasks.append(thing)
                        print(f' - {thing}')

            out_root = MD_ROOT
            print(f'translating html tree from ({inp if inp else STDIN}) into markdown tree below {out_root}')

            start = None
            for task in tasks:
                if task.name == 'index.html':
                    start = task
                    break

            for task in tasks:
                marker = ' *' if task == start else ''
                print(f'- {task}{marker}')

            if not start:
                print('did not find start target')
                return 1

            index_path = out_root / 'index.md'
            index_path.parent.mkdir(parents=True, exist_ok=True)
            pypandoc.convert_file(str(start), 'gfm', outputfile=str(index_path))
            with open(index_path, 'rt', encoding=ENCODING) as handle:
                text = filter_index(
                    [line.rstrip() for line in handle.readlines() if '</div>' not in line and '<div ' not in line]
                )
            with open(index_path, 'wt', encoding=ENCODING) as handle:
                handle.write(text)

            for task in tasks:
                if task == start:
                    continue
                # Swap only the extension; a plain replace of 'html' would also rewrite
                # that text anywhere else in the file name.
                task_path = out_root / task.with_suffix('.md').name
                pypandoc.convert_file(str(task), 'gfm', outputfile=str(task_path))
                with open(task_path, 'rt', encoding=ENCODING) as handle:
                    text = filter_leaf(
                        [line.rstrip() for line in handle.readlines() if '</div>' not in line and '<div ' not in line],
                        assets,
                    )
                with open(task_path, 'wt', encoding=ENCODING) as handle:
                    handle.write(text + '\n')

            # Push the media assets (so the md format does not remove the links)
            if assets:
                nr_assets = len(assets)
                print(f'imported {nr_assets} distinct asset{"" if nr_assets == 1 else "s"}:')
                for asset in assets:
                    print(f'- {asset}')
                    asset_source = pathlib.Path(asset_source_root) / asset
                    asset_path = out_root / asset
                    asset_path.parent.mkdir(parents=True, exist_ok=True)
                    try:
                        shutil.copyfile(asset_source, asset_path)
                    except FileNotFoundError as err:
                        print(' ... note: survived wrongly parsed file source path on shutil copy. details:', err)

            # Format the markdown
            for task in tasks:
                task_path = out_root / task.with_suffix('.md').name
                mdformat.file(task_path, options={'number': True, 'wrap': 142})

            print(f'markdown tree is below ({out_root})')

    return 0