Automate metadata collection for all DBs
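Several errors are handled. On extraction:
ExtractNotZipfile: the file is not a zip archive
ExtractNoCQLDB: the archive contains no codeql-database.yml
On detail extraction:
DetailsMissing: the metadata has no 'languages' entry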
This commit is contained in:
committed by
=Michael Hohn
parent
129b8cc302
commit
aaeafa9e88
@@ -26,18 +26,21 @@ dtale.show(dbdf_1)
|
||||
#
|
||||
#* Collect metadata from DB zip files
|
||||
#
|
||||
#** A manual sample
|
||||
#
|
||||
d = dbdf_1
|
||||
left_index = 0
|
||||
d.path[0]
|
||||
idb, ibl = extract_metadata(d.path[0])
|
||||
cqlc, metac = extract_metadata(d.path[0])
|
||||
|
||||
idb['baselineLinesOfCode']
|
||||
idb['primaryLanguage']
|
||||
idb['creationMetadata']['sha']
|
||||
idb['creationMetadata']['cliVersion']
|
||||
idb['creationMetadata']['creationTime'].isoformat()
|
||||
idb['finalised']
|
||||
cqlc['baselineLinesOfCode']
|
||||
cqlc['primaryLanguage']
|
||||
cqlc['creationMetadata']['sha']
|
||||
cqlc['creationMetadata']['cliVersion']
|
||||
cqlc['creationMetadata']['creationTime'].isoformat()
|
||||
cqlc['finalised']
|
||||
|
||||
for lang, lang_cont in ibl['languages'].items():
|
||||
for lang, lang_cont in metac['languages'].items():
|
||||
print(lang)
|
||||
indent = " "
|
||||
for prop, val in lang_cont.items():
|
||||
@@ -48,6 +51,28 @@ for lang, lang_cont in ibl['languages'].items():
|
||||
elif prop == 'displayName':
|
||||
print("%sdisplayName %s" % (indent, val))
|
||||
|
||||
#** Automated for all entries
|
||||
d = dbdf_1
joiners = []
# Visit every row, including the last one.  The lookup d.path[left_index] is
# label-based, so this presumes d carries the default 0..len(d)-1 integer
# index -- TODO confirm against dbdf_from_tree().
for left_index in range(len(d)):
    try:
        cqlc, metac = extract_metadata(d.path[left_index])
    except (ExtractNotZipfile, ExtractNoCQLDB):
        # Not a usable CodeQL DB archive; extract_metadata logs the error
        # before raising, so the entry is simply skipped here.
        continue
    try:
        detail_df = metadata_details(left_index, cqlc, metac)
    except DetailsMissing:
        # The metadata has no 'languages' entry; skip this DB.
        continue
    joiners.append(detail_df)
# Stack the per-DB detail frames, then attach them to the DB listing.  The
# outer join keeps rows of d for which no metadata could be collected.
joiners_df = pd.concat(joiners, axis=0)
full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer')
|
||||
|
||||
#** View the full dataframe with metadata
|
||||
# Third-party GUI viewer, imported at the point of use in this session.
from pandasgui import show
|
||||
# NOTE(review): the placeholder value suggests pandasgui only needs APPDATA to
# be defined, not to point at a real directory -- confirm.
os.environ['APPDATA'] = "needed-for-pandasgui"
|
||||
# Open the merged DB listing + metadata frame in the viewer.
show(full_df)
|
||||
|
||||
#
|
||||
# Local Variables:
|
||||
|
||||
@@ -30,6 +30,10 @@ def log_and_raise(message):
|
||||
logging.error(message)
|
||||
raise Exception(message)
|
||||
|
||||
def log_and_raise_e(message, exception):
    """Log *message* at error level, then raise ``exception(message)``.

    `exception` is an exception class (or any callable returning an
    exception instance); it is called with `message` as its only argument.
    """
    logging.error(message)
    error = exception(message)
    raise error
|
||||
|
||||
def traverse_tree(root):
|
||||
root_path = Path(os.path.expanduser(root))
|
||||
if not root_path.exists() or not root_path.is_dir():
|
||||
@@ -83,16 +87,95 @@ def dbdf_from_tree():
|
||||
def extract_metadata(zipfile_path):
    """Read the CodeQL metadata files from a database zip archive.

    Scans the archive members for a file whose name ends in
    'codeql-database.yml' (parsed as YAML) and one whose name ends in
    'baseline-info.json' (parsed as JSON).

    Returns:
        A tuple (codeql_content, meta_content).  When no baseline-info file
        is found, meta_content is a placeholder with a single 'no-language'
        entry.

    Raises:
        ExtractNotZipfile: `zipfile_path` is not a valid zip archive.
        ExtractNoCQLDB: no codeql-database.yml content was found.
    """
    codeql_content = None
    meta_content = None
    # Files may not be zip files:
    #    {"message":"Repository was archived so is read-only.",
    #     "documentation_url":"https://docs.github.com/rest/code-scanning/code-scanning#get-a-codeql-database-for-a-repository"}
    #
    # The archive is opened exactly once, inside the try, so that a bad file
    # is always reported as ExtractNotZipfile.
    try:
        with zipfile.ZipFile(zipfile_path, 'r') as z:
            for file_info in z.infolist():
                # Filenames seen
                #     java/codeql-database.yml
                #     codeql_db/codeql-database.yml
                if file_info.filename.endswith('codeql-database.yml'):
                    with z.open(file_info) as f:
                        codeql_content = yaml.safe_load(f)
                # And
                #     java/baseline-info.json
                #     codeql_db/baseline-info.json
                elif file_info.filename.endswith('baseline-info.json'):
                    with z.open(file_info) as f:
                        meta_content = json.load(f)
    except zipfile.BadZipFile:
        log_and_raise_e(f"Not a zipfile: '{zipfile_path}'", ExtractNotZipfile)

    # The baseline-info is only available in more recent CodeQL versions
    if not meta_content:
        meta_content = {
            'languages': {
                'no-language': {
                    'displayName': 'no-language',
                    'files': [],
                    'linesOfCode': -1,
                    'name': 'nolang',
                },
            },
        }

    if not codeql_content:
        log_and_raise_e(
            f"No codeql-database.yml found in: '{zipfile_path}'",
            ExtractNoCQLDB,
        )
    return codeql_content, meta_content
|
||||
|
||||
class ExtractNotZipfile(Exception):
    """Raised by extract_metadata when the given file is not a zip archive."""
|
||||
class ExtractNoCQLDB(Exception):
    """Raised by extract_metadata when no codeql-database.yml content is found."""
|
||||
|
||||
# metadata_details(left_index, codeql_content, meta_content)
|
||||
#
|
||||
# Extract the details from metadata that will be used in DB selection and return a
|
||||
# dataframe with the information. Example, cropped to fit:
|
||||
#
|
||||
# full_df.T
|
||||
# Out[535]:
|
||||
# 0 1
|
||||
# left_index 0 0
|
||||
# baselineLinesOfCode 17990 17990
|
||||
# primaryLanguage cpp cpp
|
||||
# sha 288920efc079766f4 282c20efc079766f4
|
||||
# cliVersion 2.17.0 2.17.0
|
||||
# creationTime .325253+00:00 51.325253+00:00
|
||||
# finalised True True
|
||||
# db_lang cpp python
|
||||
# db_lang_displayName C/C++ Python
|
||||
# db_lang_file_count 102 27
|
||||
# db_lang_linesOfCode 17990 5586
|
||||
#
|
||||
def metadata_details(left_index, codeql_content, meta_content):
    """Build a dataframe of the metadata details used for DB selection.

    Args:
        left_index: value stored in the 'left_index' column of every row;
            callers use it to join the result back to the DB listing.
        codeql_content: parsed codeql-database.yml content.
        meta_content: parsed baseline-info.json content; must contain a
            'languages' mapping.

    Returns:
        A dataframe with one row per entry in meta_content['languages'],
        each repeating the database-level columns and adding 'db_lang' plus
        whichever of 'db_lang_file_count', 'db_lang_linesOfCode' and
        'db_lang_displayName' the entry provides.  When 'languages' is
        empty, a single row holding only the database-level columns is
        returned.

    Raises:
        DetailsMissing: meta_content has no 'languages' key.
        KeyError: a required key is absent from codeql_content.
    """
    creation = codeql_content['creationMetadata']
    base = pd.DataFrame(
        {
            'left_index': left_index,
            'baselineLinesOfCode': codeql_content['baselineLinesOfCode'],
            'primaryLanguage': codeql_content['primaryLanguage'],
            # Placeholder value used when the metadata records no sha.
            'sha': creation.get('sha', 'abcde0123'),
            'cliVersion': creation['cliVersion'],
            'creationTime': creation['creationTime'],
            'finalised': codeql_content.get('finalised', pd.NA),
        },
        index=[0],
    )

    if 'languages' not in meta_content:
        log_and_raise_e("Missing 'languages' in metadata", DetailsMissing)

    rows = []
    for lang, lang_cont in meta_content['languages'].items():
        row = {'left_index': left_index, 'db_lang': lang}
        # Walk the properties in their stored order so the resulting column
        # order follows the metadata.
        for prop, val in lang_cont.items():
            if prop == 'files':
                row['db_lang_file_count'] = len(val)
            elif prop in ('linesOfCode', 'displayName'):
                row[f'db_lang_{prop}'] = val
        rows.append(row)

    if not rows:
        # An empty frame has no 'left_index' column to merge on; return the
        # database-level row by itself.
        return base

    return pd.merge(base, pd.DataFrame(rows), on='left_index', how='outer')
|
||||
|
||||
class DetailsMissing(Exception):
    """Raised by metadata_details when the metadata has no 'languages' entry."""
|
||||
|
||||
|
||||
# Local Variables:
|
||||
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
|
||||
# End:
|
||||
|
||||
Reference in New Issue
Block a user