Coverage for kohtaaminen/kohtaaminen.py: 74.52%

168 statements  

« prev     ^ index     » next       coverage.py v7.4.1, created at 2024-02-04 19:02:00 +00:00

1# -*- coding: utf-8 -*- 

2# pylint: disable=expression-not-assigned,line-too-long 

3"""Meeting, rendezvous, confluence (Finnish kohtaaminen) mark up, down, and up again. API.""" 

4import itertools 

5import os 

6import pathlib 

7import re 

8import shutil 

9import sys 

10import tempfile 

11import zipfile 

12from typing import List, Optional, Tuple, Union 

13 

14import mdformat 

15import pypandoc # type: ignore 

16 

# Environment variable consulted at import time (presumably a debug switch - not read in the code shown here).
DEBUG_VAR = 'KOHTAAMINEN_DEBUG'
DEBUG = os.environ.get(DEBUG_VAR)  # None when the variable is not set

# Text encoding used for reading and writing the markdown files.
ENCODING = 'utf-8'
ENCODING_ERRORS_POLICY = 'ignore'

# Symbolic names of the standard streams and the stream objects they map to.
STDIN = 'STDIN'
STDOUT = 'STDOUT'
DISPATCH = {STDIN: sys.stdin, STDOUT: sys.stdout}

# Folder (relative to the current working directory) receiving the markdown tree.
MD_ROOT = pathlib.Path('kohtaaminen-md')

30 

31 

def verify_request(argv: Optional[List[str]]) -> Tuple[int, str, List[str]]:
    """Validate the request and fail with grace.

    Args:
        argv: Expected to hold exactly two items - the command and the input path.

    Returns:
        A triple (error code, message, strings). On success this is ``(0, '', argv)``.
        On failure the strings are ``['']`` and the code is 2 for a usage problem
        (wrong number of arguments, unknown command) or 1 for a rejected source
        (not a file, no ``.zip`` extension).
    """
    if not argv or len(argv) != 2:
        return 2, 'received wrong number of arguments', ['']

    command, inp = argv

    # One-element tuple on purpose: `('translate')` is just a string and would turn the
    # membership test into a substring match (accepting e.g. 'trans' or the empty string).
    if command not in ('translate',):
        return 2, 'received unknown command', ['']

    # An empty input path is let through without any file system checks.
    if inp:
        in_path = pathlib.Path(str(inp))
        if not in_path.is_file():
            return 1, f'source ({in_path}) is no file', ['']
        if not ''.join(in_path.suffixes).lower().endswith('.zip'):
            return 1, 'source has not .zip extension', ['']

    return 0, '', argv

50 

51 

def filter_index(data: List[str]) -> str:
    """Reduce the lines of the converted index page to the page listing.

    The result starts with a fresh '# Index' heading. Input lines are skipped up to and
    including the first one starting with '## Available Pages:'. The lines after that are
    kept until the first one containing 'Document generated by '. Runs of blank or
    space-only lines are squeezed and the text ends with exactly one new line.
    """
    start_token = '## Available Pages:'
    stop_token = 'Document generated by '

    kept = ['# Index']
    remaining = iter(data)

    # First pass: consume the noise in front of (and including) the start token line.
    for candidate in remaining:
        if candidate.startswith(start_token):
            break

    # Second pass: collect what follows until the generator footer shows up.
    for candidate in remaining:
        if stop_token in candidate:
            break
        kept.append(candidate)

    squeezed = re.sub(r'\n\s*\n', '\n\n', '\n'.join(kept))  # squeeze space-only and blank lines
    return squeezed.strip('\n') + '\n'  # no leading new lines, single new line at end of file

68 

69 

def filter_leaf(data: List[str], assets: List[str]) -> str:
    """Clean up the lines of a converted leaf page and collect the image sources it references.

    Args:
        data: Lines of the page (without line ends).
        assets: Collector for image source paths; extended in place with every source
            not yet present.

    Returns:
        The cleaned text with blank line runs squeezed and exactly one new line at the end.

    Raises:
        ValueError: If an image line carries no '" ' after the source path.
    """
    title_token = '# <span id="title-text">'
    image_token = '"><img src="'
    span_close = '</span>'

    lines = [line for line in data if 'Created by <span class="author"> ' not in line]
    for ndx, line in enumerate(lines):
        if title_token in line:
            prefix, title_plus = line.split(title_token, 1)
            title = title_plus.strip()
            # Remove the closing tag as a suffix; str.rstrip('</span>') would strip any trailing
            # run of the characters '<', '/', 's', 'p', 'a', 'n', and '>' and so eat into titles.
            if title.endswith(span_close):
                title = title[: -len(span_close)]
            lines[ndx] = f'{prefix}# {title.strip()}'
        elif line.startswith('``` syntaxhighlighter-pre'):
            lines[ndx] = '```'
        elif image_token in line:
            later_head, image_tail = line.split(image_token, 1)
            src, rest = image_tail.split('" ', 1)
            if src not in assets:
                assets.append(src)
            try:
                _, height_plus = rest.split('data-height="', 1)
                height, width_plus = height_plus.split('" data-width="', 1)
                width, _ = width_plus.split('" ', 1)
            except ValueError as err:
                print(' ... note: survived image parsing with crash, using defaults. details:', err)
                height, width = '42', '42'
            center = 'image-center' in line
            attributes = f'width:{width}, height:{height}, center:{"true" if center else "false"}'
            # Doubled braces emit literal braces around the attributes; a single pair around
            # `{attributes}` would interpolate a Python set and leak its quoted representation.
            image = f'![]({src}){{{attributes}}} '
            lines[ndx] = f'{later_head}>\n{image}'
            if line.endswith(span_close):
                lines[ndx] += '\n</span>'

    # The image rewrite above introduces embedded new lines - go back to one entry per line.
    resplit = [part for line in lines for part in line.split('\n')]

    kept = []
    for line in resplit:
        if line.startswith(('<span', 'class="')) or line.strip() == span_close:
            continue  # left over markup
        if 'Document generated by ' in line or line.startswith('## Attachments:'):
            break  # footer reached, nothing below is of interest
        kept.append(line)

    text = re.sub(r'\n\s*\n', '\n\n', '\n'.join(kept))  # squeeze space-only and blank lines
    return text.strip('\n') + '\n'  # no leading new lines, single new line at end of file

119 

120 

def _markdown_name(source: pathlib.Path) -> str:
    """Return the file name of source with its final suffix replaced by '.md'."""
    return source.with_suffix('.md').name


def main(argv: Union[List[str], None] = None) -> int:
    """Drive the translation.

    Unpacks the zip file named in the request into a temporary folder, converts every
    html file found there into a markdown file below MD_ROOT (flat, by file name),
    copies the image assets referenced by the pages, and formats the markdown files.

    Args:
        argv: The request (command and input path) as accepted by verify_request.

    Returns:
        0 on success, the error code of verify_request for an invalid request, and 1 if
        the input is no zip file or the archive holds no index.html.
    """
    error, message, strings = verify_request(argv)
    if error:
        print(message, file=sys.stderr)
        return error

    _, inp = strings
    if not zipfile.is_zipfile(inp):
        print('wrong magic number in zipfile')
        return 1

    tasks = []
    with zipfile.ZipFile(inp, 'r') as zipper:
        alerts = []
        print(f'analyzing zip file listing of ({inp})')
        for name in zipper.namelist():
            if not name[0].isidentifier() or '..' in name:
                alerts.append(f'suspicious entry ({name}) will be skipped')
        if alerts:
            # NOTE(review): the alerts are reported only - extractall below still unpacks every entry.
            print(f'found {len(alerts)} suspicious entries in zip file ({inp}):')
            for alert in alerts:
                print(f'- {alert}')

        asset_source_root = ''
        assets: List[str] = []
        # The unpacked files are needed until the assets are copied, so all work stays inside this context.
        with tempfile.TemporaryDirectory() as unpack:
            print(f'unpacking zip file below ({unpack})')
            zipper.extractall(path=unpack)
            print(f'traversing folder ({unpack})')
            for place in sorted(pathlib.Path(unpack).glob('**')):
                print(f'* {place}')
                for thing in sorted(place.iterdir()):
                    if thing.is_dir():
                        # The parent of the first attachments folder is the root for resolving asset paths.
                        if not asset_source_root and thing.name == 'attachments':
                            asset_source_root = str(thing.parent)
                        continue
                    # Path.suffix is '' for names without extension (suffixes[-1] would raise IndexError).
                    if thing.suffix == '.html':
                        tasks.append(thing)
                    print(f'  - {thing}')

            out_root = MD_ROOT
            print(f'translating html tree from ({inp if inp else STDIN}) into markdown tree below {out_root}')

            # The first index.html encountered is the entry page.
            start = next((task for task in tasks if task.name == 'index.html'), None)

            for task in tasks:
                marker = ' *' if task == start else ''
                print(f'- {task}{marker}')

            if not start:
                print('did not find start target')
                return 1

            index_path = out_root / 'index.md'
            index_path.parent.mkdir(parents=True, exist_ok=True)
            pypandoc.convert_file(str(start), 'gfm', outputfile=str(index_path))
            with open(index_path, 'rt', encoding=ENCODING) as handle:
                text = filter_index(
                    [line.rstrip() for line in handle.readlines() if '</div>' not in line and '<div ' not in line]
                )
            with open(index_path, 'wt', encoding=ENCODING) as handle:
                handle.write(text)

            for task in tasks:
                if task == start:
                    continue
                # Swap the extension only; replacing every 'html' in the name would mangle e.g. 'html-tips.html'.
                task_path = out_root / _markdown_name(task)
                pypandoc.convert_file(str(task), 'gfm', outputfile=str(task_path))
                with open(task_path, 'rt', encoding=ENCODING) as handle:
                    text = filter_leaf(
                        [line.rstrip() for line in handle.readlines() if '</div>' not in line and '<div ' not in line],
                        assets,
                    )
                with open(task_path, 'wt', encoding=ENCODING) as handle:
                    handle.write(text + '\n')

            # Push the media assets (so the md format does not remove the links)
            if assets:
                nr_assets = len(assets)
                print(f'imported {nr_assets} distinct asset{"" if nr_assets == 1 else "s"}:')
                for asset in assets:
                    print(f'- {asset}')
                    asset_source = pathlib.Path(asset_source_root) / asset
                    asset_path = out_root / asset
                    asset_path.parent.mkdir(parents=True, exist_ok=True)
                    try:
                        shutil.copyfile(asset_source, asset_path)
                    except FileNotFoundError as err:
                        print(' ... note: survived wrongly parsed file source path on shutil copy. details:', err)

            # Format the markdown
            for task in tasks:
                task_path = out_root / _markdown_name(task)
                mdformat.file(task_path, options={'number': True, 'wrap': 142})

            print(f'markdown tree is below ({out_root})')

    return 0