Coverage for kohtaaminen/kohtaaminen.py: 74.52%

1# -*- coding: utf-8 -*-

2# pylint: disable=expression-not-assigned,line-too-long

3"""Meeting, rendezvous, confluence (Finnish kohtaaminen) mark up, down, and up again. API."""

4import itertools

5import os

6import pathlib

7import re

8import shutil

9import sys

10import tempfile

11import zipfile

12from typing import List, Optional, Tuple, Union

14import mdformat

15import pypandoc # type: ignore

# Name of the environment variable that is read for the debug switch.
DEBUG_VAR = 'KOHTAAMINEN_DEBUG'
# Raw value of that environment variable (None when it is not set).
DEBUG = os.environ.get(DEBUG_VAR)

# Text encoding and decoding error policy constants.
ENCODING = 'utf-8'
ENCODING_ERRORS_POLICY = 'ignore'

# Symbolic stream names and the mapping onto the matching standard streams.
STDIN = 'STDIN'
STDOUT = 'STDOUT'
DISPATCH = {STDIN: sys.stdin, STDOUT: sys.stdout}

# Folder (relative to the current working directory) receiving the markdown tree.
MD_ROOT = pathlib.Path('kohtaaminen-md')

def verify_request(argv: Optional[List[str]]) -> Tuple[int, str, List[str]]:
    """Validate the request vector and fail with grace.

    The vector must hold exactly two entries: the command (only ``translate``
    is known) and the source path. A non-empty source must be an existing file
    whose joined suffixes end in ``.zip`` (case-insensitive); an empty source
    string skips the file checks.

    Returns a triple ``(code, message, strings)``:

    - ``(0, '', argv)`` when the request is acceptable,
    - ``(2, message, [''])`` for a wrong argument count or an unknown command,
    - ``(1, message, [''])`` when the source is no file or lacks the zip extension.
    """
    if not argv or len(argv) != 2:
        return 2, 'received wrong number of arguments', ['']

    command, inp = argv

    # The trailing comma matters: a bare ('translate') is just a string and would turn
    # this into a substring test that lets '' or 'trans' pass as valid commands.
    if command not in ('translate',):
        return 2, 'received unknown command', ['']

    if inp:
        in_path = pathlib.Path(str(inp))
        if not in_path.is_file():
            return 1, f'source ({in_path}) is no file', ['']
        if not ''.join(in_path.suffixes).lower().endswith('.zip'):
            return 1, 'source has not .zip extension', ['']

    return 0, '', argv

def filter_index(data: List[str]) -> str:
    """Reduce the lines of the converted index page to the listing part.

    Everything up to and including the first line starting with
    ``## Available Pages:`` is dropped and replaced by a ``# Index`` heading.
    The remainder is cut off at the first line containing
    ``Document generated by ``. Runs of blank or space-only lines are squeezed
    to a single blank line and the text ends with exactly one new line.
    """
    marker = '## Available Pages:'
    from_marker = list(itertools.dropwhile(lambda entry: not entry.startswith(marker), data))

    # The marker line itself (first element, if any) is not part of the result.
    candidates = ['# Index'] + from_marker[1:]

    kept = []
    for entry in candidates:
        if 'Document generated by ' in entry:
            break
        kept.append(entry)

    squeezed = re.sub(r'\n\s*\n', '\n\n', '\n'.join(kept))  # squeeze space-only and blank lines
    return squeezed.strip('\n') + '\n'  # no leading new lines, single new line at end

def filter_leaf(data: List[str], assets: List[str]) -> str:
    """Clean the lines of a converted leaf page and return the resulting text.

    Processing steps:

    - drop lines containing the ``Created by <span class="author"> `` byline,
    - turn the ``# <span id="title-text">`` heading into a plain heading,
    - replace ``syntaxhighlighter-pre`` code fence openers with a bare fence,
    - rewrite embedded ``<img`` lines into markdown images with an attribute
      suffix, falling back to 42 for height and width when these cannot be parsed,
    - drop remaining span wrapper lines,
    - cut the text at the first ``Document generated by `` line and at the
      first line starting with ``## Attachments:``,
    - squeeze blank lines and end the text with exactly one new line.

    Every image source not yet present in ``assets`` is appended to that list
    (the list is modified in place).
    """
    title_token = '# <span id="title-text">'
    image_token = '"><img src="'
    span_close = '</span>'

    lines = [line for line in data if 'Created by <span class="author"> ' not in line]
    for ndx, line in enumerate(lines):
        if title_token in line:
            prefix, title_plus = line.split(title_token, 1)
            title = title_plus.strip()
            # Remove the closing tag as a suffix only; str.rstrip('</span>') treats its
            # argument as a set of characters and would also eat trailing title letters.
            if title.endswith(span_close):
                title = title[: -len(span_close)]
            lines[ndx] = f'{prefix}# {title.strip()}'
        elif line.startswith('``` syntaxhighlighter-pre'):
            lines[ndx] = '```'
        elif image_token in line:
            later_head, image_tail = line.split(image_token, 1)
            later = f'{later_head}>\n'
            src, rest = image_tail.split('" ', 1)
            if src not in assets:
                assets.append(src)
            try:
                _, height_plus = rest.split('data-height="', 1)
                height, width_plus = height_plus.split('" data-width="', 1)
                width, _ = width_plus.split('" ', 1)
            except ValueError as err:
                print(' ... note: survived image parsing with crash, using defaults. details:', err)
                height, width = '42', '42'
            center = 'image-center' in line
            span_tail = line.endswith(span_close)
            attributes = f'width:{width}, height:{height}, center:{"true" if center else "false"}'
            # NOTE(review): the replacement field below holds a one-element set, so the emitted
            # text is the repr of that set (quoted attributes inside braces). Literal braces
            # were possibly intended - confirm before changing the output format.
            image = f'![]({src}){ {attributes}} '
            lines[ndx] = later + image
            if span_tail:
                lines[ndx] += '\n</span>'

    # The image rewrite above embeds new lines; split so every entry is a single line again.
    resplit = [part for line in lines for part in line.split('\n')]
    lines = [
        line
        for line in resplit
        if not line.startswith('<span') and not line.startswith('class="') and line.strip() != span_close
    ]
    lines = list(itertools.takewhile(lambda x: 'Document generated by ' not in x, lines))
    text = '\n'.join(itertools.takewhile(lambda x: not x.startswith('## Attachments:'), lines))
    text = re.sub(r'\n\s*\n', '\n\n', text)  # squeeze space-only and blank lines
    text = text.lstrip('\n')  # no leading new lines
    text = text.rstrip('\n') + '\n'  # single new line at end of file
    return text

119

120

def main(argv: Union[List[str], None] = None) -> int:
    """Drive the translation.

    Validates the request, unpacks the zip archive into a temporary folder,
    converts every ``.html`` file found there to GitHub flavored markdown below
    ``MD_ROOT`` (via pypandoc), post-processes the index and leaf pages, copies
    the image assets referenced by the leaf pages, and finally formats the
    markdown files (via mdformat).

    Returns 0 on success, the code reported by ``verify_request`` for a bad
    request, and 1 when the source is no zip archive or when no ``index.html``
    is found in the archive.
    """
    error, message, strings = verify_request(argv)
    if error:
        print(message, file=sys.stderr)
        return error

    _, inp = strings
    if not zipfile.is_zipfile(inp):
        print('wrong magic number in zipfile')
        return 1

    tasks = []
    with zipfile.ZipFile(inp, 'r') as zipper:
        alerts = []
        print(f'analyzing zip file listing of ({inp})')
        for name in zipper.namelist():
            if not name[0].isidentifier() or '..' in name:
                alerts.append(f'suspicious entry ({name}) will be skipped')
        if alerts:
            # Suspicious entries are only reported; the extraction below still unpacks all entries.
            print(f'found {len(alerts)} suspicious entries in zip file ({inp}):')
            for alert in alerts:
                print(f'- {alert}')

        asset_source_root = ''
        assets: List[str] = []
        with tempfile.TemporaryDirectory() as unpack:
            print(f'unpacking zip file below ({unpack})')
            zipper.extractall(path=unpack)
            print(f'traversing folder ({unpack})')
            for place in sorted(pathlib.Path(unpack).glob('**')):
                print(f'* {place}')
                for thing in sorted(place.iterdir()):
                    if thing.is_dir():
                        # The parent of the first attachments folder is the root for asset lookups.
                        if not asset_source_root and thing.name == 'attachments':
                            asset_source_root = str(thing.parent)
                        continue
                    # Path.suffix is '' for files without extension (suffixes[-1] would raise IndexError).
                    if thing.suffix == '.html':
                        tasks.append(thing)
                    print(f'  - {thing}')

            out_root = MD_ROOT
            print(f'translating html tree from ({inp if inp else STDIN}) into markdown tree below {out_root}')

            # The first index.html found is the start page.
            start = next((task for task in tasks if task.name == 'index.html'), None)

            for task in tasks:
                marker = ' *' if task == start else ''
                print(f'- {task}{marker}')

            if not start:
                print('did not find start target')
                return 1

            index_path = out_root / 'index.md'
            index_path.parent.mkdir(parents=True, exist_ok=True)
            pypandoc.convert_file(str(start), 'gfm', outputfile=str(index_path))
            with open(index_path, 'rt', encoding=ENCODING) as handle:
                text = filter_index(
                    [line.rstrip() for line in handle.readlines() if '</div>' not in line and '<div ' not in line]
                )
            with open(index_path, 'wt', encoding=ENCODING) as handle:
                handle.write(text)

            for task in tasks:
                if task == start:
                    continue
                # Swap only the extension; replacing every 'html' would also mangle the stem.
                task_path = out_root / task.with_suffix('.md').name
                pypandoc.convert_file(str(task), 'gfm', outputfile=str(task_path))
                with open(task_path, 'rt', encoding=ENCODING) as handle:
                    text = filter_leaf(
                        [line.rstrip() for line in handle.readlines() if '</div>' not in line and '<div ' not in line],
                        assets,
                    )
                with open(task_path, 'wt', encoding=ENCODING) as handle:
                    handle.write(text + '\n')

            # Push the media assets (so the md format does not remove the links)
            if assets:
                nr_assets = len(assets)
                print(f'imported {nr_assets} distinct asset{"" if nr_assets == 1 else "s"}:')
                for asset in assets:
                    print(f'- {asset}')
                    asset_source = pathlib.Path(asset_source_root) / asset
                    asset_path = out_root / asset
                    asset_path.parent.mkdir(parents=True, exist_ok=True)
                    try:
                        shutil.copyfile(asset_source, asset_path)
                    except FileNotFoundError as err:
                        print(' ... note: survived wrongly parsed file source path on shutil copy. details:', err)

            # Format the markdown
            for task in tasks:
                task_path = out_root / task.with_suffix('.md').name
                mdformat.file(task_path, options={'number': True, 'wrap': 142})

            print(f'markdown tree is below ({out_root})')

    return 0