Coverage for lumikko/tree_shape.py: 0.00%

287 statements  

coverage.py v7.4.1, created at 2024-02-04 20:15:23 +00:00

#! /usr/bin/env python3
# pylint: disable=invalid-name,line-too-long
"""Visit a folder tree and report mime-type statistics."""
import collections
import copy
import csv
import datetime as dti
import hashlib
import json
import lzma
import os
import pathlib
import subprocess  # nosec B404
import sys

from git import Repo
from git.exc import InvalidGitRepositoryError

ENCODING = 'utf-8'
ENCODING_ERRORS_POLICY = 'ignore'

HASH_POLICY = 'sha256'
HASH_POLICIES_KNOWN = (HASH_POLICY,)

PROXY_DB = 'MIME_TYPES_PROXY_DB'

TS_FORMAT_HR = '%Y-%m-%d %H:%M:%S'
TS_FORMAT_DB = '%Y%m%dT%H%M%SZ'
GIGA = 2 << (30 - 1)  # 2**30 bytes (1 GiB)
BUFFER_BYTES = 2 << 15  # 64 KiB read buffer

STORE_ROOT = '.lumikko-store'
STORE_PATH_ENTER = pathlib.Path(STORE_ROOT, 'enter')
STORE_PATH_TOMBS = pathlib.Path(STORE_ROOT, 'tombs')
STORE_PATH_PROXY = pathlib.Path(STORE_ROOT, 'proxy')

XZ_FILTERS = [{'id': lzma.FILTER_LZMA2, 'preset': 7 | lzma.PRESET_EXTREME}]
XZ_EXT = '.xz'

def archive(stream, file_path):
    """Create .xz files for long term storage."""
    file_path = pathlib.Path(file_path)  # HACK A DID ACK
    if file_path.suffixes[-1] != XZ_EXT:
        file_path = file_path.with_suffix(file_path.suffix + XZ_EXT)
    with lzma.open(file_path, 'w', check=lzma.CHECK_SHA256, filters=XZ_FILTERS) as f:
        for entry in stream:
            f.write(entry.encode(encoding=ENCODING, errors=ENCODING_ERRORS_POLICY))

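# A minimal usage sketch for archive() (illustrative only; the /tmp path is an
# assumption, not part of the module):
#
#   archive(iter(('alpha,1\n', 'beta,2\n')), '/tmp/demo.csv')
#   # -> writes /tmp/demo.csv.xz (LZMA2, preset 7 | EXTREME, SHA-256 check)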

def load(proxy_db_path):
    """Load the proxy data as dict."""
    file_path = pathlib.Path(proxy_db_path)  # HACK A DID ACK
    if file_path.is_dir():
        file_path = pathlib.Path(sorted(file_path.glob('*'))[-1])
        print(f'Found latest proxy source as {file_path}')
    else:
        print(f'Received latest proxy source as {file_path}')

    if not file_path.is_file():
        raise FileNotFoundError('bootstrap of proxy file reading failed.')
    if file_path.suffixes[-1] == XZ_EXT:
        with lzma.open(file_path, mode='rt', encoding=ENCODING, errors=ENCODING_ERRORS_POLICY) as handle:
            return {row[0]: row[0:] for row in csv.reader(handle, delimiter=',', quotechar='|')}
    else:
        with open(file_path, newline='') as handle:
            return {row[0]: row[0:] for row in csv.reader(handle, delimiter=',', quotechar='|')}

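# The returned mapping keys every stored path to its full CSV row, e.g.
# (values are an illustrative assumption, not real data):
#
#   {'some/file.txt': ['some/file.txt', '3', '42', '1707077723.0',
#                      '1707077723.0', 'sha256:…', 'text/plain; charset=us-ascii']}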

def by_name(text, hash_length):
    """Fast and shallow hash rep validity probe."""
    hash_rep_length, base = hash_length, 16
    if len(text) != hash_rep_length:
        return False
    try:
        _ = int(text, base)
    except ValueError:
        return False
    return True


def possible_hash(text, hash_policy=HASH_POLICY):
    """Fast and shallow hash rep validity probe."""
    probe = {
        HASH_POLICY: 64,
        'sha1': 40,
    }
    return by_name(text, probe[hash_policy])

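# Illustrative probes (length plus hex parse only, no real digest validation):
#
#   possible_hash('f' * 64)          # -> True  (64 hex chars pass for sha256)
#   possible_hash('f' * 40, 'sha1')  # -> True
#   possible_hash('z' * 64)          # -> False (not parseable as base 16)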

def naive_timestamp(timestamp=None):
    """Logging helper."""
    if timestamp:
        return timestamp.strftime(TS_FORMAT_HR)
    return dti.datetime.now().strftime(TS_FORMAT_HR)


def db_timestamp(timestamp=None):
    """Database naming helper."""
    if timestamp:
        return timestamp.strftime(TS_FORMAT_DB)
    return dti.datetime.now().strftime(TS_FORMAT_DB)

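# Both helpers format the same instant, just differently:
#
#   ts = dti.datetime(2024, 2, 4, 20, 15, 23)
#   naive_timestamp(ts)  # -> '2024-02-04 20:15:23'
#   db_timestamp(ts)     # -> '20240204T201523Z'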

def spider_tree(base_path):
    """Visit all elements in the folders below base path and return the count."""
    return sum(1 for _ in pathlib.Path(base_path).rglob('**/*'))


def walk_files(base_path):
    """Visit the files in the folders below base path in sorted order."""
    for file_path in sorted(base_path.rglob('**/*')):
        yield file_path

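# Usage sketch (the tree content is an assumption): both helpers visit every
# entry below the root, so counts include folders as well as files.
#
#   spider_tree('.')                     # -> number of entries below the cwd
#   next(walk_files(pathlib.Path('.')))  # -> first path in sorted order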

def elf_hash(some_bytes: bytes):
    """The ELF hash (Extremely Lossy Function - also used in ELF format).

    unsigned long ElfHash(const unsigned char *s) {
        unsigned long h = 0, high;
        while (*s) {
            h = (h << 4) + *s++;
            if (high = h & 0xF0000000)
                h ^= high >> 24;
            h &= ~high;
        }
        return h;
    }
    """
    h = 0
    for s in some_bytes:
        h = (h << 4) + s
        high = h & 0xF0000000
        if high:
            h ^= high >> 24
        h &= ~high
    return h

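# Worked examples (values follow directly from the loop above):
#
#   elf_hash(b'')     # -> 0
#   elf_hash(b'a')    # -> 97 (single byte, no high nibble overflow yet)
#   elf_hash(b'elf')  # -> 27686 ((101 << 4) + 108 == 1724; (1724 << 4) + 102 == 27686)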

def hash_file(path_string):
    """Return the hex digest of the implicit algorithm (sha256) for path."""
    path = pathlib.Path(path_string)
    if not path.is_file():
        raise IOError('path is no file for hashing.')

    value = hashlib.sha256()
    with open(path, 'rb') as in_file:
        for byte_block in iter(lambda in_f=in_file: in_f.read(BUFFER_BYTES), b''):
            value.update(byte_block)

    return value.hexdigest()

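# Sanity check sketch (the /tmp path is an assumption; the digest is the well
# known sha256 of the empty byte string):
#
#   pathlib.Path('/tmp/empty.bin').write_bytes(b'')
#   hash_file('/tmp/empty.bin')
#   # -> 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'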

def count_lines(path_string):
    """Return the number of newline chars (\\n) of path."""
    path = pathlib.Path(path_string)
    if not path.is_file():
        raise IOError('path is no file for line count.')

    value = 0
    with open(path, 'rb') as in_file:
        for byte_block in iter(lambda in_f=in_file: in_f.read(BUFFER_BYTES), b''):
            value += byte_block.count(b'\n')

    return value

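# Note the count is of newline bytes, not logical lines (illustrative /tmp path):
#
#   pathlib.Path('/tmp/three.txt').write_bytes(b'a\nb\nc')
#   count_lines('/tmp/three.txt')  # -> 2 (trailing text without newline is not counted)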

def hashes(path_string, algorithms=None):
    """Return the dict of hex digests per algorithm for path."""
    if algorithms is None:
        algorithms = {HASH_POLICY: hashlib.sha256}
    for key in algorithms:
        if key not in HASH_POLICIES_KNOWN:
            raise ValueError('hashes received unexpected algorithm key.')

    path = pathlib.Path(path_string)
    if not path.is_file():
        raise IOError('path is no file.')

    accumulator = {k: f() for k, f in algorithms.items()}
    with open(path, 'rb') as in_file:
        for byte_block in iter(lambda in_f=in_file: in_f.read(BUFFER_BYTES), b''):
            for k in algorithms:
                accumulator[k].update(byte_block)

    return {k: f.hexdigest() for k, f in accumulator.items()}

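# Shape of the result (digest shown for an empty file at an assumed path):
#
#   hashes('/tmp/empty.bin')
#   # -> {'sha256': 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'}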

def file_metrics(file_path):
    """Retrieve the file stats."""
    return file_path.stat()


def mime_type(file_path):
    """Either return the mime type from the file command (without the file name in the result) or arti/choke."""
    find_type = ['file', '--mime', file_path]
    try:
        output = subprocess.check_output(find_type, stderr=subprocess.STDOUT).decode()  # nosec B603
        if not output.strip().endswith('(No such file or directory)'):
            return output.strip().split(':', 1)[1].strip()
    except subprocess.CalledProcessError:
        pass  # for now
    return 'arti/choke'

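# The value is whatever the platform's file(1) prints after the colon, so the
# exact string varies by system (example value is an assumption):
#
#   mime_type(pathlib.Path('lumikko/tree_shape.py'))
#   # -> 'text/x-script.python; charset=us-ascii' (or 'arti/choke' on error)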

def serialize(storage_hash, line_count, f_stat, fps, file_type):
    """Serialize one entry as a comma separated line."""  # TODO(sthagen) round trip has become a mess - fix it
    size_bytes, c_time, m_time = f_stat.st_size, f_stat.st_ctime, f_stat.st_mtime
    return f"{','.join((storage_hash, str(line_count), str(size_bytes), str(c_time), str(m_time), fps, file_type))}\n"

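# One serialized row (all field values illustrative; note the trailing newline):
#
#   'deadbeef,3,42,1707077723.0,1707077723.0,sha256:deadbeef,text/plain; charset=us-ascii\n'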

def gen_out_stream(kind):
    """DRY helper - generate the CSV lines from the dict values."""
    for v in kind.values():
        yield f"{','.join(v)}\n"


def derive_fingerprints(algorithms, file_path):
    """Build the comma separated algorithm:digest fingerprint string for file_path."""
    fingerprints = hashes(file_path, algorithms)
    fps = f'{",".join([f"{k}:{v}" for k, v in fingerprints.items()])}'
    return fps

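# Fingerprint string shape (shown for an empty file at an assumed path;
# digest abbreviated for illustration):
#
#   derive_fingerprints(None, '/tmp/empty.bin')  # -> 'sha256:e3b0c442…7852b855'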

def visit_store(at, hash_policy, algorithms, enter, proxy, update):
    """Here we walk the tree and dispatch."""
    # Note: hash_policy and algorithms are accepted but currently unused - hash_file is fixed to sha256
    mime_types = collections.Counter()
    mime_sizes, mime_lines = {}, {}
    found_bytes, public, private, non_file = 0, 0, 0, 0
    for file_path in walk_files(pathlib.Path(at)):
        if file_path.name.startswith('.') or '/.' in str(file_path):
            private += 1
            continue
        public += 1
        storage_name = str(file_path)
        if not file_path.is_file():
            non_file += 1
            continue
        storage_hash = hash_file(file_path)
        f_stat = file_metrics(file_path)
        mt = mime_type(file_path)
        mime, charset = '', ''
        if ';' in mt:
            mime, charset = mt.split(';', 1)
        line_count = None
        if mime.startswith('text/') or mime.endswith('xml') or mime.endswith('script'):
            line_count = count_lines(file_path)
            if mime not in mime_lines:
                mime_lines[mime] = 0
            mime_lines[mime] += line_count

        mime_types.update([mt])
        if mt not in mime_sizes:
            mime_sizes[mt] = 0
        mime_sizes[mt] += f_stat.st_size

        if storage_name not in proxy:
            found_bytes += f_stat.st_size
            enter[storage_name] = (
                storage_name,
                str(line_count),
                str(f_stat.st_size),
                str(f_stat.st_ctime),
                str(f_stat.st_mtime),
                f'sha256:{storage_hash}',
                mt,
            )
        else:
            update.add(storage_name)
    return found_bytes, public, private, non_file, mime_types, mime_sizes, mime_lines

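# Return order: found_bytes, public, private, non_file, mime_types (a
# collections.Counter), mime_sizes (bytes per mime type), and mime_lines
# (newline counts per textual mime type).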

def distribute_changes(enter, leave, keep, proxy, update):
    """Distribute proxy rows into keep and leave, merge the entering rows, and return byte tallies."""
    entered_bytes, ignored_bytes, updated_bytes, left_bytes = 0, 0, 0, 0
    for k, v in proxy.items():
        if k in update:
            ignored_bytes += int(v[2])
            keep[k] = copy.deepcopy(v)
        else:
            left_bytes += int(v[2])
            leave[k] = copy.deepcopy(v)
    updated_bytes += ignored_bytes
    for k, v in enter.items():
        entered_bytes += int(v[2])
        keep[k] = copy.deepcopy(v)
    updated_bytes += entered_bytes
    return entered_bytes, ignored_bytes, left_bytes, updated_bytes

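# Byte tallies read the size at row index 2 (rows abbreviated, values
# illustrative):
#
#   proxy = {'kept.txt': ('kept.txt', '1', '10', ...), 'gone.txt': ('gone.txt', '1', '5', ...)}
#   update = {'kept.txt'}
#   # -> ignored_bytes == 10 (kept), left_bytes == 5 (moved to leave)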

def derive_proxy_paths(start_ts):
    """Derive the added, gone, and proxy db paths and ensure their folders exist."""
    added_db = pathlib.Path(STORE_PATH_ENTER, f'added-{db_timestamp(start_ts)}.csv')
    proxy_db = pathlib.Path(STORE_PATH_PROXY, f'proxy-{db_timestamp(start_ts)}.csv')
    gone_db = pathlib.Path(STORE_PATH_TOMBS, f'gone-{db_timestamp(start_ts)}.csv')
    pathlib.Path(STORE_PATH_ENTER).mkdir(parents=True, exist_ok=True)
    pathlib.Path(STORE_PATH_PROXY).mkdir(parents=True, exist_ok=True)
    pathlib.Path(STORE_PATH_TOMBS).mkdir(parents=True, exist_ok=True)
    return added_db, gone_db, proxy_db

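# Resulting layout below the store root (timestamp shown for this report's
# creation instant as an example):
#
#   .lumikko-store/enter/added-20240204T201523Z.csv
#   .lumikko-store/tombs/gone-20240204T201523Z.csv
#   .lumikko-store/proxy/proxy-20240204T201523Z.csv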

def process(tree_root):
    """Drive the tree visitor."""
    hash_policy = HASH_POLICY
    proxy_db = os.getenv(PROXY_DB, '')
    proxy_db_path = proxy_db if proxy_db else STORE_PATH_PROXY
    report = {
        'pwd': str(pathlib.Path.cwd()),
        'proxy_db_path': str(proxy_db_path),
        'store_root': str(STORE_ROOT),
    }
    try:
        proxy = load(proxy_db_path)
    except FileNotFoundError:
        proxy = {}
        print(
            f'Warning: Initializing proxy databases below {STORE_ROOT} as no path given per {PROXY_DB} or load failed'
        )

    previous = len(proxy)
    enter, update, leave = {}, set(), {}
    at_or_below = tree_root

    try:
        repo = Repo(at_or_below)
    except InvalidGitRepositoryError:
        print(f'Warning: Tree {at_or_below} is not under version control')
        repo = None
    branch_name = None
    revision = None
    revision_date = None
    origin_url = None
    if repo:
        try:
            origin_url = repo.remotes.origin.url
        except AttributeError:
            print(f'Warning: Repository {at_or_below} has no remote')
        branch_name = repo.active_branch.name
        try:
            revision = repo.head.commit
            revision_date = naive_timestamp(dti.datetime.utcfromtimestamp(revision.committed_date))
        except ValueError:
            pass

    report['origin_url'] = origin_url
    report['branch_name'] = branch_name
    report['revision_hash'] = str(revision) if revision else None
    report['revision_date'] = revision_date
    report['previous'] = str(previous)
    report['at_or_below'] = str(at_or_below)

    algorithms = None
    if hash_policy != HASH_POLICY:
        algorithms = {
            HASH_POLICY: hashlib.sha256,
        }
        print(f'Warning: Store seems to use ({hash_policy}) not ({HASH_POLICY}) - using ({HASH_POLICY})')

    start_ts = dti.datetime.now()
    report['spider_start'] = naive_timestamp(start_ts)
    stream_size = spider_tree(at_or_below)
    report['visitor_start'] = naive_timestamp()
    found_bytes, public, private, non_file, mt_c, mt_s, mt_l = visit_store(
        at_or_below, hash_policy, algorithms, enter, proxy, update
    )

    keep = {}
    entered_bytes, ignored_bytes, left_bytes, updated_bytes = distribute_changes(enter, leave, keep, proxy, update)

    added_db, gone_db, proxy_db = derive_proxy_paths(start_ts)

    report['mime_file_counter'] = str(pathlib.Path(STORE_ROOT, 'mime-counts.json'))
    report['mime_size_bytes'] = str(pathlib.Path(STORE_ROOT, 'mime-sizes-bytes.json'))
    report['mime_line_counts'] = str(pathlib.Path(STORE_ROOT, 'mime-line-counts.json'))

    with open(report['mime_file_counter'], 'wt', encoding=ENCODING) as handle:
        json.dump(dict(mt_c), handle, indent=2, sort_keys=True)

    with open(report['mime_size_bytes'], 'wt', encoding=ENCODING) as handle:
        json.dump(mt_s, handle, indent=2, sort_keys=True)

    with open(report['mime_line_counts'], 'wt', encoding=ENCODING) as handle:
        json.dump(mt_l, handle, indent=2, sort_keys=True)

    entered, updated, left = len(enter), len(keep), len(leave)
    ignored = public - entered
    for db, kind in ((added_db, enter), (proxy_db, keep), (gone_db, leave)):
        archive(gen_out_stream(kind), db)

    report['entered'] = entered
    report['entered_bytes'] = entered_bytes
    report['added_db'] = str(added_db)
    report['ignored'] = ignored
    report['ignored_bytes'] = ignored_bytes
    report['updated'] = updated
    report['updated_bytes'] = updated_bytes
    report['proxy_db'] = str(proxy_db)
    report['left'] = left
    report['left_bytes'] = left_bytes
    report['gone_db'] = str(gone_db)
    report['found_bytes'] = found_bytes
    report['stream_size'] = stream_size
    report['private'] = private
    report['public'] = public
    report['non_file'] = non_file
    report['typer_stop'] = naive_timestamp()

    with open(pathlib.Path('report.json'), 'wt', encoding=ENCODING) as handle:
        json.dump(report, handle, indent=2, sort_keys=True)
    return 0

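# After a run, report.json summarizes the visit; a truncated, illustrative
# shape (keys per the assignments above, values are assumptions):
#
#   {"at_or_below": ".", "entered": 2, "entered_bytes": 84,
#    "public": 3, "private": 1, "non_file": 0, ...}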

def main(argv=None):
    """Dispatch the single tree root argument to the processor."""
    argv = argv if argv else sys.argv[1:]
    if len(argv) != 1:
        print('ERROR tree root argument expected.', file=sys.stderr)
        return 2
    tree_root = argv[0].strip()
    return process(tree_root)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
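# Command line usage sketch (module path per this report; the invocation
# details are an assumption):
#
#   $ python3 lumikko/tree_shape.py .
#   # exit code 0 on success, 2 on a missing or extra argument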