Coverage for lumikko/tree_shape.py: 0.00% (287 statements)
#! /usr/bin/env python3
# pylint: disable=invalid-name,line-too-long
"""Visit a folder tree and report mime-type statistics."""
import collections
import copy
import csv
import datetime as dti
import hashlib
import json
import lzma
import os
import pathlib
import subprocess  # nosec B404
import sys

from git import Repo
from git.exc import InvalidGitRepositoryError

ENCODING = 'utf-8'
ENCODING_ERRORS_POLICY = 'ignore'

HASH_POLICY = 'sha256'
HASH_POLICIES_KNOWN = (HASH_POLICY,)

PROXY_DB = 'MIME_TYPES_PROXY_DB'

TS_FORMAT_HR = '%Y-%m-%d %H:%M:%S'
TS_FORMAT_DB = '%Y%m%dT%H%M%SZ'
GIGA = 2 << (30 - 1)
BUFFER_BYTES = 2 << 15

STORE_ROOT = '.lumikko-store'
STORE_PATH_ENTER = pathlib.Path(STORE_ROOT, 'enter')
STORE_PATH_TOMBS = pathlib.Path(STORE_ROOT, 'tombs')
STORE_PATH_PROXY = pathlib.Path(STORE_ROOT, 'proxy')

XZ_FILTERS = [{'id': lzma.FILTER_LZMA2, 'preset': 7 | lzma.PRESET_EXTREME}]
XZ_EXT = '.xz'


def archive(stream, file_path):
    """Create .xz files for long term storage."""
    file_path = pathlib.Path(file_path)  # HACK A DID ACK
    if file_path.suffixes[-1] != XZ_EXT:
        file_path = file_path.with_suffix(file_path.suffix + XZ_EXT)
    with lzma.open(file_path, 'w', check=lzma.CHECK_SHA256, filters=XZ_FILTERS) as f:
        for entry in stream:
            f.write(entry.encode(encoding=ENCODING, errors=ENCODING_ERRORS_POLICY))
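
# Illustrative use of archive() (names are made up for the example):
#
#     lines = ('alpha,1\n', 'beta,2\n')
#     archive(iter(lines), 'snapshot.csv')  # produces snapshot.csv.xz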


def load(proxy_db_path):
    """Load the proxy data as dict."""
    file_path = pathlib.Path(proxy_db_path)  # HACK A DID ACK
    if file_path.is_dir():
        file_path = pathlib.Path(sorted(file_path.glob('*'))[-1])
        print(f'Found latest proxy source as {file_path}')
    else:
        print(f'Received latest proxy source as {file_path}')

    if not file_path.is_file():
        raise FileNotFoundError('bootstrap of proxy file reading failed.')
    if file_path.suffixes[-1] == XZ_EXT:
        with lzma.open(file_path, mode='rt', encoding=ENCODING, errors=ENCODING_ERRORS_POLICY) as handle:
            return {row[0]: row[0:] for row in csv.reader(handle, delimiter=',', quotechar='|')}
    else:
        with open(file_path, newline='') as handle:
            return {row[0]: row[0:] for row in csv.reader(handle, delimiter=',', quotechar='|')}
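
# Sketch of the expected proxy row format (fields follow the enter tuples
# written by visit_store(); the placeholders are only an illustration):
#
#     path,line_count,size_bytes,c_time,m_time,sha256:<hex>,mime_type
#
# load() keys the resulting dict by the first field (the path).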


def by_name(text, hash_length):
    """Fast and shallow hash rep validity probe."""
    hash_rep_length, base = hash_length, 16
    if len(text) != hash_rep_length:
        return False
    try:
        _ = int(text, base)
    except ValueError:
        return False
    return True


def possible_hash(text, hash_policy=HASH_POLICY):
    """Fast and shallow hash rep validity probe."""
    probe = {
        HASH_POLICY: 64,
        'sha1': 40,
    }
    return by_name(text, probe[hash_policy])
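
# Quick probe examples (digests shortened/invented for illustration):
#
#     possible_hash('a' * 64)           # True - 64 hex chars match the sha256 length
#     possible_hash('a' * 40, 'sha1')   # True - 40 hex chars match the sha1 length
#     possible_hash('not-a-digest')     # False - wrong length / not hexadecimal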


def naive_timestamp(timestamp=None):
    """Logging helper."""
    if timestamp:
        return timestamp.strftime(TS_FORMAT_HR)
    return dti.datetime.now().strftime(TS_FORMAT_HR)


def db_timestamp(timestamp=None):
    """Logging helper."""
    if timestamp:
        return timestamp.strftime(TS_FORMAT_DB)
    return dti.datetime.now().strftime(TS_FORMAT_DB)


def spider_tree(base_path):
    """Visit all elements in the folders below base path and return the count."""
    return sum(1 for _ in pathlib.Path(base_path).rglob('**/*'))


def walk_files(base_path):
    """Visit the files in the folders below base path in sorted order."""
    for file_path in sorted(base_path.rglob('**/*')):
        yield file_path


def elf_hash(some_bytes: bytes):
    """The ELF hash (Extremely Lossy Function - also used in ELF format).
    unsigned long ElfHash(const unsigned char *s) {
        unsigned long h = 0, high;
        while (*s) {
            h = (h << 4) + *s++;
            if (high = h & 0xF0000000)
                h ^= high >> 24;
            h &= ~high;
        }
        return h;
    }
    """
    h = 0
    for s in some_bytes:
        h = (h << 4) + s
        high = h & 0xF0000000
        if high:
            h ^= high >> 24
        h &= ~high
    return h
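
# Worked example for the Python port above (values checked by hand, shown only
# to illustrate the shift/xor folding):
#
#     elf_hash(b'a')   # 0x61: (0 << 4) + 0x61, no high nibble set yet
#     elf_hash(b'ab')  # 0x672: (0x61 << 4) + 0x62, still below 0xF0000000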


def hash_file(path_string):
    """Return the hex digest of the implicit algorithm (sha256) for the file at path."""
    path = pathlib.Path(path_string)
    if not path.is_file():
        raise IOError('path is not a file for hashing.')

    value = hashlib.sha256()
    with open(path, 'rb') as in_file:
        for byte_block in iter(lambda in_f=in_file: in_f.read(BUFFER_BYTES), b''):
            value.update(byte_block)

    return value.hexdigest()


def count_lines(path_string):
    """Return the number of newline chars (\\n) of path."""
    path = pathlib.Path(path_string)
    if not path.is_file():
        raise IOError('path is not a file for line count.')

    value = 0
    with open(path, 'rb') as in_file:
        for byte_block in iter(lambda in_f=in_file: in_f.read(BUFFER_BYTES), b''):
            value += byte_block.count(b'\n')

    return value


def hashes(path_string, algorithms=None):
    """Return the hashes per algorithms of path."""
    if algorithms is None:
        algorithms = {HASH_POLICY: hashlib.sha256}
    for key in algorithms:
        if key not in HASH_POLICIES_KNOWN:
            raise ValueError('hashes received unexpected algorithm key.')

    path = pathlib.Path(path_string)
    if not path.is_file():
        raise IOError('path is not a file.')

    accumulator = {k: f() for k, f in algorithms.items()}
    with open(path, 'rb') as in_file:
        for byte_block in iter(lambda in_f=in_file: in_f.read(BUFFER_BYTES), b''):
            for k in algorithms:
                accumulator[k].update(byte_block)

    return {k: f.hexdigest() for k, f in accumulator.items()}
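
# Illustrative calls (only sha256 is in HASH_POLICIES_KNOWN, so other keys raise):
#
#     hashes('some/file')                              # {'sha256': '<hex digest>'}
#     hashes('some/file', {'sha256': hashlib.sha256})  # same, explicit algorithm map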


def file_metrics(file_path):
    """Retrieve the file stats."""
    return file_path.stat()


def mime_type(file_path):
    """Either return the mime type from the file command (stripped of the file name) or 'arti/choke'."""
    find_type = ['file', '--mime', file_path]
    try:
        output = subprocess.check_output(find_type, stderr=subprocess.STDOUT).decode()  # nosec B603
        if not output.strip().endswith('(No such file or directory)'):
            return output.strip().split(':', 1)[1].strip()
    except subprocess.CalledProcessError:
        pass  # for now
    return 'arti/choke'
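
# Example return value for a plain text file: 'text/plain; charset=us-ascii'
# (the exact charset detection depends on the platform's file(1) implementation).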


def serialize(storage_hash, line_count, f_stat, fps, file_type):
    """Serialize one entry into the comma separated proxy line format."""  # TODO(sthagen) round trip has become a mess - fix it
    size_bytes, c_time, m_time = f_stat.st_size, f_stat.st_ctime, f_stat.st_mtime
    return f"{','.join((storage_hash, str(line_count), str(size_bytes), str(c_time), str(m_time), fps, file_type))}\n"
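
# Line shape produced by serialize() (field order as joined above; the values
# would come from the corresponding hash, stat, and mime probes):
#
#     <storage_hash>,<line_count>,<size_bytes>,<c_time>,<m_time>,<fps>,<file_type>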


def gen_out_stream(kind):
    """Generate the CSV lines from the values of kind (DRY helper)."""
    for v in kind.values():
        yield f"{','.join(v)}\n"


def derive_fingerprints(algorithms, file_path):
    """Derive the fingerprints of file_path as comma separated algorithm:digest pairs."""
    fingerprints = hashes(file_path, algorithms)
    fps = f'{",".join([f"{k}:{v}" for k, v in fingerprints.items()])}'
    return fps


def visit_store(at, hash_policy, algorithms, enter, proxy, update):
    """Here we walk the tree and dispatch."""
    mime_types = collections.Counter()
    mime_sizes, mime_lines = {}, {}
    found_bytes, public, private, non_file = 0, 0, 0, 0
    for file_path in walk_files(pathlib.Path(at)):
        if file_path.name.startswith('.') or '/.' in str(file_path):
            private += 1
            continue
        public += 1
        storage_name = str(file_path)
        if not file_path.is_file():
            non_file += 1
            continue
        storage_hash = hash_file(file_path)
        f_stat = file_metrics(file_path)
        mt = mime_type(file_path)
        mime, charset = '', ''
        if ';' in mt:
            mime, charset = mt.split(';', 1)
        line_count = None
        if mime.startswith('text/') or mime.endswith('xml') or mime.endswith('script'):
            line_count = count_lines(file_path)
            if mime not in mime_lines:
                mime_lines[mime] = 0
            mime_lines[mime] += line_count

        mime_types.update([mt])
        if mt not in mime_sizes:
            mime_sizes[mt] = 0
        mime_sizes[mt] += f_stat.st_size

        if storage_name not in proxy:
            found_bytes += f_stat.st_size
            enter[storage_name] = (
                storage_name,
                str(line_count),
                str(f_stat.st_size),
                str(f_stat.st_ctime),
                str(f_stat.st_mtime),
                f'sha256:{storage_hash}',
                mt,
            )
        else:
            update.add(storage_name)
    return found_bytes, public, private, non_file, mime_types, mime_sizes, mime_lines
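
# Return tuple of visit_store (names as unpacked by the caller in process()):
#
#     found_bytes, public, private, non_file, mime_counts, mime_sizes, mime_lines
#
# where private counts dot-file and dot-folder paths that were skipped, non_file
# counts directories and other non-regular entries, and found_bytes sums the
# sizes of files not yet present in the proxy.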


def distribute_changes(enter, leave, keep, proxy, update):
    """Split the proxy entries into keep and leave and tally the bytes entered, ignored, left, and updated."""
    entered_bytes, ignored_bytes, updated_bytes, left_bytes = 0, 0, 0, 0
    for k, v in proxy.items():
        if k in update:
            ignored_bytes += int(v[2])
            keep[k] = copy.deepcopy(v)
        else:
            left_bytes += int(v[2])
            leave[k] = copy.deepcopy(v)
    updated_bytes += ignored_bytes
    for k, v in enter.items():
        entered_bytes += int(v[2])
        keep[k] = copy.deepcopy(v)
    updated_bytes += entered_bytes
    return entered_bytes, ignored_bytes, left_bytes, updated_bytes
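
# Byte accounting in a nutshell (quantities invented for illustration): with a
# proxy of 10 entries totalling 100 bytes, 7 of them re-seen (update) at 70 bytes,
# and 3 new files (enter) at 30 bytes, the call yields
#
#     entered_bytes, ignored_bytes, left_bytes, updated_bytes == 30, 70, 30, 100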


def derive_proxy_paths(start_ts):
    """Derive the timestamped added, gone, and proxy CSV paths and ensure their folders exist."""
    added_db = pathlib.Path(STORE_PATH_ENTER, f'added-{db_timestamp(start_ts)}.csv')
    proxy_db = pathlib.Path(STORE_PATH_PROXY, f'proxy-{db_timestamp(start_ts)}.csv')
    gone_db = pathlib.Path(STORE_PATH_TOMBS, f'gone-{db_timestamp(start_ts)}.csv')
    pathlib.Path(STORE_PATH_ENTER).mkdir(parents=True, exist_ok=True)
    pathlib.Path(STORE_PATH_PROXY).mkdir(parents=True, exist_ok=True)
    pathlib.Path(STORE_PATH_TOMBS).mkdir(parents=True, exist_ok=True)
    return added_db, gone_db, proxy_db
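
# Example of the derived paths for a run starting 2024-02-04 20:15:23 UTC
# (the timestamp is only an illustration of the TS_FORMAT_DB pattern):
#
#     .lumikko-store/enter/added-20240204T201523Z.csv
#     .lumikko-store/proxy/proxy-20240204T201523Z.csv
#     .lumikko-store/tombs/gone-20240204T201523Z.csv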


def process(tree_root):
    """Drive the tree visitor."""

    hash_policy = HASH_POLICY
    proxy_db = os.getenv(PROXY_DB, '')
    proxy_db_path = proxy_db if proxy_db else STORE_PATH_PROXY
    report = {
        'pwd': str(pathlib.Path.cwd()),
        'proxy_db_path': str(proxy_db_path),
        'store_root': str(STORE_ROOT),
    }
    try:
        proxy = load(proxy_db_path)
    except FileNotFoundError:
        proxy = {}
        print(
            f'Warning: Initializing proxy databases below {STORE_ROOT} as no path given per {PROXY_DB} or load failed'
        )

    previous = len(proxy)
    enter, update, leave = {}, set(), {}
    at_or_below = tree_root

    try:
        repo = Repo(at_or_below)
    except InvalidGitRepositoryError:
        print(f'Warning: Tree {at_or_below} is not under version control')
        repo = None
    branch_name = None
    revision = None
    revision_date = None
    origin_url = None
    if repo:
        try:
            origin_url = repo.remotes.origin.url
        except AttributeError:
            print(f'Warning: Repository {at_or_below} has no remote')
        branch_name = repo.active_branch.name
        try:
            revision = repo.head.commit
            revision_date = naive_timestamp(dti.datetime.utcfromtimestamp(revision.committed_date))
        except ValueError:
            pass

    report['origin_url'] = origin_url
    report['branch_name'] = branch_name
    report['revision_hash'] = str(revision) if revision else None
    report['revision_date'] = revision_date
    report['previous'] = str(previous)
    report['at_or_below'] = str(at_or_below)

    algorithms = None
    if hash_policy != HASH_POLICY:
        algorithms = {
            HASH_POLICY: hashlib.sha256,
        }
        print(f'Warning: Store seems to not use ({HASH_POLICY}) - using ({HASH_POLICY})')

    start_ts = dti.datetime.now()
    report['spider_start'] = naive_timestamp(start_ts)
    stream_size = spider_tree(at_or_below)
    report['visitor_start'] = naive_timestamp()
    found_bytes, public, private, non_file, mt_c, mt_s, mt_l = visit_store(
        at_or_below, hash_policy, algorithms, enter, proxy, update
    )

    keep = {}
    entered_bytes, ignored_bytes, left_bytes, updated_bytes = distribute_changes(enter, leave, keep, proxy, update)

    added_db, gone_db, proxy_db = derive_proxy_paths(start_ts)

    report['mime_file_counter'] = str(pathlib.Path(STORE_ROOT, 'mime-counts.json'))
    report['mime_size_bytes'] = str(pathlib.Path(STORE_ROOT, 'mime-sizes-bytes.json'))
    report['mime_line_counts'] = str(pathlib.Path(STORE_ROOT, 'mime-line-counts.json'))

    with open(report['mime_file_counter'], 'wt', encoding=ENCODING) as handle:
        json.dump(dict(mt_c), handle, indent=2, sort_keys=True)

    with open(report['mime_size_bytes'], 'wt', encoding=ENCODING) as handle:
        json.dump(mt_s, handle, indent=2, sort_keys=True)

    with open(report['mime_line_counts'], 'wt', encoding=ENCODING) as handle:
        json.dump(mt_l, handle, indent=2, sort_keys=True)

    entered, updated, left = len(enter), len(keep), len(leave)
    ignored = public - entered
    for db, kind in ((added_db, enter), (proxy_db, keep), (gone_db, leave)):
        archive(gen_out_stream(kind), db)

    report['entered'] = entered
    report['entered_bytes'] = entered_bytes
    report['added_db'] = str(added_db)
    report['ignored'] = ignored
    report['ignored_bytes'] = ignored_bytes
    report['updated'] = updated
    report['updated_bytes'] = updated_bytes
    report['proxy_db'] = str(proxy_db)
    report['left'] = left
    report['left_bytes'] = left_bytes
    report['gone_db'] = str(gone_db)
    report['found_bytes'] = found_bytes
    report['stream_size'] = stream_size
    report['private'] = private
    report['public'] = public
    report['non_file'] = non_file
    report['typer_stop'] = naive_timestamp()

    with open(pathlib.Path('report.json'), 'wt', encoding=ENCODING) as handle:
        json.dump(report, handle, indent=2, sort_keys=True)
    return 0


def main(argv=None):
    """Process the tree below the single tree root argument."""
    argv = argv if argv else sys.argv[1:]
    if len(argv) != 1:
        print('ERROR tree root argument expected.', file=sys.stderr)
        return 2
    tree_root = argv[0].strip()
    return process(tree_root)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))