From aaeafa9e88bc19a158039453adc466dff0649510 Mon Sep 17 00:00:00 2001
From: Michael Hohn
Date: Mon, 22 Jul 2024 19:12:12 -0700
Subject: [PATCH] Automate metadata collection for all DBs

Several errors are handled.

On extraction
    ExtractNotZipfile: the file is not a zip archive
    ExtractNoCQLDB:    no (or an empty) codeql-database.yml in the archive

On detail extraction
    DetailsMissing:    the metadata has no 'languages' entry
---
 client/qldbtools/qldbtools/session2.py | 41 ++++++++---
 client/qldbtools/qldbtools/utils.py    | 99 +++++++++++++++++++++++---
 2 files changed, 124 insertions(+), 16 deletions(-)

diff --git a/client/qldbtools/qldbtools/session2.py b/client/qldbtools/qldbtools/session2.py
index 1010d7e..6318ef7 100644
--- a/client/qldbtools/qldbtools/session2.py
+++ b/client/qldbtools/qldbtools/session2.py
@@ -26,18 +26,21 @@ dtale.show(dbdf_1)
 #
 #* Collect metadata from DB zip files
 #
+#** A manual sample
+#
 d = dbdf_1
+left_index = 0
 d.path[0]
-idb, ibl = extract_metadata(d.path[0])
+cqlc, metac = extract_metadata(d.path[0])
 
-idb['baselineLinesOfCode']
-idb['primaryLanguage']
-idb['creationMetadata']['sha']
-idb['creationMetadata']['cliVersion']
-idb['creationMetadata']['creationTime'].isoformat()
-idb['finalised']
+cqlc['baselineLinesOfCode']
+cqlc['primaryLanguage']
+cqlc['creationMetadata']['sha']
+cqlc['creationMetadata']['cliVersion']
+cqlc['creationMetadata']['creationTime'].isoformat()
+cqlc['finalised']
 
-for lang, lang_cont in ibl['languages'].items():
+for lang, lang_cont in metac['languages'].items():
     print(lang)
     indent = " "
     for prop, val in lang_cont.items():
@@ -48,6 +51,28 @@ for lang, lang_cont in ibl['languages'].items():
         elif prop == 'displayName':
             print("%sdisplayName %s" % (indent, val))
 
+#** Automated for all entries
+d = dbdf_1
+joiners = []
+for left_index in range(len(d)):  # visit every row, including the last one
+    try:
+        cqlc, metac = extract_metadata(d.path[left_index])
+    except ExtractNotZipfile:
+        continue
+    except ExtractNoCQLDB:
+        continue
+    try:
+        detail_df = metadata_details(left_index, cqlc, metac)
+    except DetailsMissing:
+        continue
+    joiners.append(detail_df)
+joiners_df = pd.concat(joiners, axis=0)
+full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer')
+
+#** View the full dataframe with metadata
+from pandasgui import show
+os.environ['APPDATA'] = "needed-for-pandasgui"
+show(full_df)
 
 #
 # Local Variables:
diff --git a/client/qldbtools/qldbtools/utils.py b/client/qldbtools/qldbtools/utils.py
index 4ffb55d..203e923 100644
--- a/client/qldbtools/qldbtools/utils.py
+++ b/client/qldbtools/qldbtools/utils.py
@@ -30,6 +30,10 @@ def log_and_raise(message):
     logging.error(message)
     raise Exception(message)
 
+def log_and_raise_e(message, exception):
+    logging.error(message)
+    raise exception(message)
+
 def traverse_tree(root):
     root_path = Path(os.path.expanduser(root))
     if not root_path.exists() or not root_path.is_dir():
@@ -83,16 +87,95 @@ def dbdf_from_tree():
 def extract_metadata(zipfile_path):
     codeql_content = None
     meta_content = None
-    with zipfile.ZipFile(zipfile_path, 'r') as z:
-        for file_info in z.infolist():
-            if file_info.filename == 'codeql_db/codeql-database.yml':
-                with z.open(file_info) as f:
-                    codeql_content = yaml.safe_load(f)
-            elif file_info.filename == 'codeql_db/baseline-info.json':
-                with z.open(file_info) as f:
-                    meta_content = json.load(f)
+    # Files may not be zip files:
+    #     {"message":"Repository was archived so is read-only.",
+    #     "documentation_url":"https://docs.github.com/rest/code-scanning/code-scanning#get-a-codeql-database-for-a-repository"}
+    #
+    try:
+        with zipfile.ZipFile(zipfile_path, 'r') as z:
+            for file_info in z.infolist():
+                # Filenames seen
+                #     java/codeql-database.yml
+                #     codeql_db/codeql-database.yml
+                if file_info.filename.endswith('codeql-database.yml'):
+                    with z.open(file_info) as f:
+                        codeql_content = yaml.safe_load(f)
+                # And
+                #     java/baseline-info.json
+                #     codeql_db/baseline-info.json
+                elif file_info.filename.endswith('baseline-info.json'):
+                    with z.open(file_info) as f:
+                        meta_content = json.load(f)
+    except zipfile.BadZipFile:
+        log_and_raise_e(f"Not a zipfile: '{zipfile_path}'", ExtractNotZipfile)
+    # The baseline-info is only available in more recent CodeQL versions
+    if not meta_content:
+        meta_content = {'languages':
+                        {'no-language': {'displayName': 'no-language',
+                                         'files': [],
+                                         'linesOfCode': -1,
+                                         'name': 'nolang'},
+                         }}
+
+    if not codeql_content:
+        log_and_raise_e(f"No codeql-database.yml content in: '{zipfile_path}'", ExtractNoCQLDB)
     return codeql_content, meta_content
 
+class ExtractNotZipfile(Exception): pass
+class ExtractNoCQLDB(Exception): pass
+
+# metadata_details(left_index, codeql_content, meta_content)
+#
+# Extract the details from metadata that will be used in DB selection and return a
+# dataframe with the information.  Example, cropped to fit:
+#
+#    full_df.T
+#    Out[535]:
+#                                         0                  1
+#    left_index                           0                  0
+#    baselineLinesOfCode              17990              17990
+#    primaryLanguage                    cpp                cpp
+#    sha                  288920efc079766f4  282c20efc079766f4
+#    cliVersion                      2.17.0             2.17.0
+#    creationTime             .325253+00:00    51.325253+00:00
+#    finalised                         True               True
+#    db_lang                            cpp             python
+#    db_lang_displayName              C/C++             Python
+#    db_lang_file_count                 102                 27
+#    db_lang_linesOfCode              17990               5586
+#
+def metadata_details(left_index, codeql_content, meta_content):
+    cqlc, metac = codeql_content, meta_content
+    d = {'left_index': left_index,
+         'baselineLinesOfCode': cqlc['baselineLinesOfCode'],
+         'primaryLanguage': cqlc['primaryLanguage'],
+         'sha': cqlc['creationMetadata'].get('sha', 'abcde0123'),  # placeholder when absent
+         'cliVersion': cqlc['creationMetadata']['cliVersion'],
+         'creationTime': cqlc['creationMetadata']['creationTime'],
+         'finalised': cqlc.get('finalised', pd.NA),
+         }
+    f = pd.DataFrame(d, index=[0])
+    joiners = []
+    if 'languages' not in metac:
+        log_and_raise_e("Missing 'languages' in metadata", DetailsMissing)
+    for lang, lang_cont in metac['languages'].items():
+        d1 = { 'left_index' : left_index,
+               'db_lang': lang }
+        for prop, val in lang_cont.items():
+            if prop == 'files':
+                d1['db_lang_file_count'] = len(val)
+            elif prop == 'linesOfCode':
+                d1['db_lang_linesOfCode'] = val
+            elif prop == 'displayName':
+                d1['db_lang_displayName'] = val
+        joiners.append(d1)
+    fj = pd.DataFrame(joiners)
+    full_df = pd.merge(f, fj, on='left_index', how='outer')
+    return full_df
+
+class DetailsMissing(Exception): pass
+
+
 # Local Variables:
 # python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
 # End: