Coverage for lumikko/tree_shape.py: 0.00% (287 statements)
#! /usr/bin/env python3
# pylint: disable=invalid-name,line-too-long
"""Visit a folder tree and report mime-type statistics."""
import collections
import copy
import csv
import datetime as dti
import hashlib
import json
import lzma
import os
import pathlib
import subprocess  # nosec B404
import sys

from git import Repo
from git.exc import InvalidGitRepositoryError

ENCODING = 'utf-8'
ENCODING_ERRORS_POLICY = 'ignore'

HASH_POLICY = 'sha256'
HASH_POLICIES_KNOWN = (HASH_POLICY,)

PROXY_DB = 'MIME_TYPES_PROXY_DB'

TS_FORMAT_HR = '%Y-%m-%d %H:%M:%S'
TS_FORMAT_DB = '%Y%m%dT%H%M%SZ'
GIGA = 2 << (30 - 1)
BUFFER_BYTES = 2 << 15

STORE_ROOT = '.lumikko-store'
STORE_PATH_ENTER = pathlib.Path(STORE_ROOT, 'enter')
STORE_PATH_TOMBS = pathlib.Path(STORE_ROOT, 'tombs')
STORE_PATH_PROXY = pathlib.Path(STORE_ROOT, 'proxy')

XZ_FILTERS = [{'id': lzma.FILTER_LZMA2, 'preset': 7 | lzma.PRESET_EXTREME}]
XZ_EXT = '.xz'


def archive(stream, file_path):
    """Create .xz files for long term storage."""
    file_path = pathlib.Path(file_path)  # HACK A DID ACK
    if file_path.suffixes[-1] != XZ_EXT:
        file_path = file_path.with_suffix(file_path.suffix + XZ_EXT)
    with lzma.open(file_path, 'w', check=lzma.CHECK_SHA256, filters=XZ_FILTERS) as f:
        for entry in stream:
            f.write(entry.encode(encoding=ENCODING, errors=ENCODING_ERRORS_POLICY))
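
# Illustrative use of archive() (names are made up for the example):
#
#     lines = ('alpha,1\n', 'beta,2\n')
#     archive(iter(lines), 'snapshot.csv')  # produces snapshot.csv.xz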


def load(proxy_db_path):
    """Load the proxy data as dict."""
    file_path = pathlib.Path(proxy_db_path)  # HACK A DID ACK
    if file_path.is_dir():
        file_path = pathlib.Path(sorted(file_path.glob('*'))[-1])
        print(f'Found latest proxy source as {file_path}')
    else:
        print(f'Received latest proxy source as {file_path}')

    if not file_path.is_file():
        raise FileNotFoundError('bootstrap of proxy file reading failed.')
    if file_path.suffixes[-1] == XZ_EXT:
        with lzma.open(file_path, mode='rt', encoding=ENCODING, errors=ENCODING_ERRORS_POLICY) as handle:
            return {row[0]: row[0:] for row in csv.reader(handle, delimiter=',', quotechar='|')}
    else:
        with open(file_path, newline='') as handle:
            return {row[0]: row[0:] for row in csv.reader(handle, delimiter=',', quotechar='|')}
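
# Sketch of the expected proxy row format (fields follow the enter tuples
# written by visit_store(); the placeholders are only an illustration):
#
#     path,line_count,size_bytes,c_time,m_time,sha256:<hex>,mime_type
#
# load() keys the resulting dict by the first field (the path).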


def by_name(text, hash_length):
    """Fast and shallow hash rep validity probe."""
    hash_rep_length, base = hash_length, 16
    if len(text) != hash_rep_length:
        return False
    try:
        _ = int(text, base)
    except ValueError:
        return False
    return True


def possible_hash(text, hash_policy=HASH_POLICY):
    """Fast and shallow hash rep validity probe."""
    probe = {
        HASH_POLICY: 64,
        'sha1': 40,
    }
    return by_name(text, probe[hash_policy])
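
# Quick probe examples (digests shortened/invented for illustration):
#
#     possible_hash('a' * 64)           # True - 64 hex chars match the sha256 length
#     possible_hash('a' * 40, 'sha1')   # True - 40 hex chars match the sha1 length
#     possible_hash('not-a-digest')     # False - wrong length / not hexadecimal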


def naive_timestamp(timestamp=None):
    """Logging helper."""
    if timestamp:
        return timestamp.strftime(TS_FORMAT_HR)
    return dti.datetime.now().strftime(TS_FORMAT_HR)


def db_timestamp(timestamp=None):
    """Logging helper."""
    if timestamp:
        return timestamp.strftime(TS_FORMAT_DB)
    return dti.datetime.now().strftime(TS_FORMAT_DB)


def spider_tree(base_path):
    """Visit all elements in the folders below base path and return the count."""
    return sum(1 for _ in pathlib.Path(base_path).rglob('**/*'))


def walk_files(base_path):
    """Visit the files in the folders below base path in sorted order."""
    for file_path in sorted(base_path.rglob('**/*')):
        yield file_path


def elf_hash(some_bytes: bytes):
    """The ELF hash (Extremely Lossy Function - also used in ELF format).
    unsigned long ElfHash(const unsigned char *s) {
        unsigned long h = 0, high;
        while (*s) {
            h = (h << 4) + *s++;
            if (high = h & 0xF0000000)
                h ^= high >> 24;
            h &= ~high;
        }
        return h;
    }
    """
    h = 0
    for s in some_bytes:
        h = (h << 4) + s
        high = h & 0xF0000000
        if high:
            h ^= high >> 24
        h &= ~high
    return h
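
# Worked example for the Python port above (values checked by hand, shown only
# to illustrate the shift/xor folding):
#
#     elf_hash(b'a')   # 0x61: (0 << 4) + 0x61, no high nibble set yet
#     elf_hash(b'ab')  # 0x672: (0x61 << 4) + 0x62, still below 0xF0000000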


def hash_file(path_string):
    """Return the hex digest of the implicit algorithm (sha256) for the file at path."""
    path = pathlib.Path(path_string)
    if not path.is_file():
        raise IOError('path is not a file for hashing.')

    value = hashlib.sha256()
    with open(path, 'rb') as in_file:
        for byte_block in iter(lambda in_f=in_file: in_f.read(BUFFER_BYTES), b''):
            value.update(byte_block)

    return value.hexdigest()


def count_lines(path_string):
    """Return the number of newline chars (\\n) of path."""
    path = pathlib.Path(path_string)
    if not path.is_file():
        raise IOError('path is not a file for line count.')

    value = 0
    with open(path, 'rb') as in_file:
        for byte_block in iter(lambda in_f=in_file: in_f.read(BUFFER_BYTES), b''):
            value += byte_block.count(b'\n')

    return value


def hashes(path_string, algorithms=None):
    """Return the hashes per algorithms of path."""
    if algorithms is None:
        algorithms = {HASH_POLICY: hashlib.sha256}
    for key in algorithms:
        if key not in HASH_POLICIES_KNOWN:
            raise ValueError('hashes received unexpected algorithm key.')

    path = pathlib.Path(path_string)
    if not path.is_file():
        raise IOError('path is not a file.')

    accumulator = {k: f() for k, f in algorithms.items()}
    with open(path, 'rb') as in_file:
        for byte_block in iter(lambda in_f=in_file: in_f.read(BUFFER_BYTES), b''):
            for k in algorithms:
                accumulator[k].update(byte_block)

    return {k: f.hexdigest() for k, f in accumulator.items()}
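
# Illustrative calls (only sha256 is in HASH_POLICIES_KNOWN, so other keys raise):
#
#     hashes('some/file')                              # {'sha256': '<hex digest>'}
#     hashes('some/file', {'sha256': hashlib.sha256})  # same, explicit algorithm map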


def file_metrics(file_path):
    """Retrieve the file stats."""
    return file_path.stat()


def mime_type(file_path):
    """Either return the mime type from the file command (stripped of the file name) or 'arti/choke'."""
    find_type = ['file', '--mime', file_path]
    try:
        output = subprocess.check_output(find_type, stderr=subprocess.STDOUT).decode()  # nosec B603
        if not output.strip().endswith('(No such file or directory)'):
            return output.strip().split(':', 1)[1].strip()
    except subprocess.CalledProcessError:
        pass  # for now
    return 'arti/choke'
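
# Example return value for a plain text file: 'text/plain; charset=us-ascii'
# (the exact charset detection depends on the platform's file(1) implementation).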


def serialize(storage_hash, line_count, f_stat, fps, file_type):
    """Serialize one entry into the comma separated proxy line format."""  # TODO(sthagen) round trip has become a mess - fix it
    size_bytes, c_time, m_time = f_stat.st_size, f_stat.st_ctime, f_stat.st_mtime
    return f"{','.join((storage_hash, str(line_count), str(size_bytes), str(c_time), str(m_time), fps, file_type))}\n"
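
# Line shape produced by serialize() (field order as joined above; the values
# would come from the corresponding hash, stat, and mime probes):
#
#     <storage_hash>,<line_count>,<size_bytes>,<c_time>,<m_time>,<fps>,<file_type>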


def gen_out_stream(kind):
    """Generate the CSV lines from the values of kind (DRY helper)."""
    for v in kind.values():
        yield f"{','.join(v)}\n"


def derive_fingerprints(algorithms, file_path):
    """Derive the fingerprints of file_path as comma separated algorithm:digest pairs."""
    fingerprints = hashes(file_path, algorithms)
    fps = f'{",".join([f"{k}:{v}" for k, v in fingerprints.items()])}'
    return fps


def visit_store(at, hash_policy, algorithms, enter, proxy, update):
    """Here we walk the tree and dispatch."""
    mime_types = collections.Counter()
    mime_sizes, mime_lines = {}, {}
    found_bytes, public, private, non_file = 0, 0, 0, 0
    for file_path in walk_files(pathlib.Path(at)):
        if file_path.name.startswith('.') or '/.' in str(file_path):
            private += 1
            continue
        public += 1
        storage_name = str(file_path)
        if not file_path.is_file():
            non_file += 1
            continue
        storage_hash = hash_file(file_path)
        f_stat = file_metrics(file_path)
        mt = mime_type(file_path)
        mime, charset = '', ''
        if ';' in mt:
            mime, charset = mt.split(';', 1)
        line_count = None
        if mime.startswith('text/') or mime.endswith('xml') or mime.endswith('script'):
            line_count = count_lines(file_path)
            if mime not in mime_lines:
                mime_lines[mime] = 0
            mime_lines[mime] += line_count

        mime_types.update([mt])
        if mt not in mime_sizes:
            mime_sizes[mt] = 0
        mime_sizes[mt] += f_stat.st_size

        if storage_name not in proxy:
            found_bytes += f_stat.st_size
            enter[storage_name] = (
                storage_name,
                str(line_count),
                str(f_stat.st_size),
                str(f_stat.st_ctime),
                str(f_stat.st_mtime),
                f'sha256:{storage_hash}',
                mt,
            )
        else:
            update.add(storage_name)
    return found_bytes, public, private, non_file, mime_types, mime_sizes, mime_lines
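
# Return tuple of visit_store (names as unpacked by the caller in process()):
#
#     found_bytes, public, private, non_file, mime_counts, mime_sizes, mime_lines
#
# where private counts dot-file and dot-folder paths that were skipped, non_file
# counts directories and other non-regular entries, and found_bytes sums the
# sizes of files not yet present in the proxy.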


def distribute_changes(enter, leave, keep, proxy, update):
    """Split the proxy entries into keep and leave and tally the bytes entered, ignored, left, and updated."""
    entered_bytes, ignored_bytes, updated_bytes, left_bytes = 0, 0, 0, 0
    for k, v in proxy.items():
        if k in update:
            ignored_bytes += int(v[2])
            keep[k] = copy.deepcopy(v)
        else:
            left_bytes += int(v[2])
            leave[k] = copy.deepcopy(v)
    updated_bytes += ignored_bytes
    for k, v in enter.items():
        entered_bytes += int(v[2])
        keep[k] = copy.deepcopy(v)
    updated_bytes += entered_bytes
    return entered_bytes, ignored_bytes, left_bytes, updated_bytes
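
# Byte accounting in a nutshell (quantities invented for illustration): with a
# proxy of 10 entries totalling 100 bytes, 7 of them re-seen (update) at 70 bytes,
# and 3 new files (enter) at 30 bytes, the call yields
#
#     entered_bytes, ignored_bytes, left_bytes, updated_bytes == 30, 70, 30, 100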


def derive_proxy_paths(start_ts):
    """Derive the timestamped added, gone, and proxy CSV paths and ensure their folders exist."""
    added_db = pathlib.Path(STORE_PATH_ENTER, f'added-{db_timestamp(start_ts)}.csv')
    proxy_db = pathlib.Path(STORE_PATH_PROXY, f'proxy-{db_timestamp(start_ts)}.csv')
    gone_db = pathlib.Path(STORE_PATH_TOMBS, f'gone-{db_timestamp(start_ts)}.csv')
    pathlib.Path(STORE_PATH_ENTER).mkdir(parents=True, exist_ok=True)
    pathlib.Path(STORE_PATH_PROXY).mkdir(parents=True, exist_ok=True)
    pathlib.Path(STORE_PATH_TOMBS).mkdir(parents=True, exist_ok=True)
    return added_db, gone_db, proxy_db
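
# Example of the derived paths for a run starting 2024-02-04 20:15:23 UTC
# (the timestamp is only an illustration of the TS_FORMAT_DB pattern):
#
#     .lumikko-store/enter/added-20240204T201523Z.csv
#     .lumikko-store/proxy/proxy-20240204T201523Z.csv
#     .lumikko-store/tombs/gone-20240204T201523Z.csv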


def process(tree_root):
    """Drive the tree visitor."""

    hash_policy = HASH_POLICY
    proxy_db = os.getenv(PROXY_DB, '')
    proxy_db_path = proxy_db if proxy_db else STORE_PATH_PROXY
    report = {
        'pwd': str(pathlib.Path.cwd()),
        'proxy_db_path': str(proxy_db_path),
        'store_root': str(STORE_ROOT),
    }
    try:
        proxy = load(proxy_db_path)
    except FileNotFoundError:
        proxy = {}
        print(
            f'Warning: Initializing proxy databases below {STORE_ROOT} as no path given per {PROXY_DB} or load failed'
        )

    previous = len(proxy)
    enter, update, leave = {}, set(), {}
    at_or_below = tree_root

    try:
        repo = Repo(at_or_below)
    except InvalidGitRepositoryError:
        print(f'Warning: Tree {at_or_below} is not under version control')
        repo = None
    branch_name = None
    revision = None
    revision_date = None
    origin_url = None
    if repo:
        try:
            origin_url = repo.remotes.origin.url
        except AttributeError:
            print(f'Warning: Repository {at_or_below} has no remote')
        branch_name = repo.active_branch.name
        try:
            revision = repo.head.commit
            revision_date = naive_timestamp(dti.datetime.utcfromtimestamp(revision.committed_date))
        except ValueError:
            pass

    report['origin_url'] = origin_url
    report['branch_name'] = branch_name
    report['revision_hash'] = str(revision) if revision else None
    report['revision_date'] = revision_date
    report['previous'] = str(previous)
    report['at_or_below'] = str(at_or_below)

    algorithms = None
    if hash_policy != HASH_POLICY:
        algorithms = {
            HASH_POLICY: hashlib.sha256,
        }
        print(f'Warning: Store seems to not use ({HASH_POLICY}) - using ({HASH_POLICY})')

    start_ts = dti.datetime.now()
    report['spider_start'] = naive_timestamp(start_ts)
    stream_size = spider_tree(at_or_below)
    report['visitor_start'] = naive_timestamp()
    found_bytes, public, private, non_file, mt_c, mt_s, mt_l = visit_store(
        at_or_below, hash_policy, algorithms, enter, proxy, update
    )

    keep = {}
    entered_bytes, ignored_bytes, left_bytes, updated_bytes = distribute_changes(enter, leave, keep, proxy, update)

    added_db, gone_db, proxy_db = derive_proxy_paths(start_ts)

    report['mime_file_counter'] = str(pathlib.Path(STORE_ROOT, 'mime-counts.json'))
    report['mime_size_bytes'] = str(pathlib.Path(STORE_ROOT, 'mime-sizes-bytes.json'))
    report['mime_line_counts'] = str(pathlib.Path(STORE_ROOT, 'mime-line-counts.json'))

    with open(report['mime_file_counter'], 'wt', encoding=ENCODING) as handle:
        json.dump(dict(mt_c), handle, indent=2, sort_keys=True)

    with open(report['mime_size_bytes'], 'wt', encoding=ENCODING) as handle:
        json.dump(mt_s, handle, indent=2, sort_keys=True)

    with open(report['mime_line_counts'], 'wt', encoding=ENCODING) as handle:
        json.dump(mt_l, handle, indent=2, sort_keys=True)

    entered, updated, left = len(enter), len(keep), len(leave)
    ignored = public - entered
    for db, kind in ((added_db, enter), (proxy_db, keep), (gone_db, leave)):
        archive(gen_out_stream(kind), db)

    report['entered'] = entered
    report['entered_bytes'] = entered_bytes
    report['added_db'] = str(added_db)
    report['ignored'] = ignored
    report['ignored_bytes'] = ignored_bytes
    report['updated'] = updated
    report['updated_bytes'] = updated_bytes
    report['proxy_db'] = str(proxy_db)
    report['left'] = left
    report['left_bytes'] = left_bytes
    report['gone_db'] = str(gone_db)
    report['found_bytes'] = found_bytes
    report['stream_size'] = stream_size
    report['private'] = private
    report['public'] = public
    report['non_file'] = non_file
    report['typer_stop'] = naive_timestamp()

    with open(pathlib.Path('report.json'), 'wt', encoding=ENCODING) as handle:
        json.dump(report, handle, indent=2, sort_keys=True)
    return 0


def main(argv=None):
    """Process the tree below the single tree root argument."""
    argv = argv if argv else sys.argv[1:]
    if len(argv) != 1:
        print('ERROR tree root argument expected.', file=sys.stderr)
        return 2
    tree_root = argv[0].strip()
    return process(tree_root)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))