Automate metadata collection for all DBs
Several errors are handled; on extraction
ExtractNotZipfile:
ExtractNoCQLDB:
On detail extraction
DetailsMissing:
This commit is contained in:
committed by
=Michael Hohn
parent
129b8cc302
commit
aaeafa9e88
@@ -26,18 +26,21 @@ dtale.show(dbdf_1)
|
|||||||
#
|
#
|
||||||
#* Collect metadata from DB zip files
|
#* Collect metadata from DB zip files
|
||||||
#
|
#
|
||||||
|
#** A manual sample
|
||||||
|
#
|
||||||
d = dbdf_1
|
d = dbdf_1
|
||||||
|
left_index = 0
|
||||||
d.path[0]
|
d.path[0]
|
||||||
idb, ibl = extract_metadata(d.path[0])
|
cqlc, metac = extract_metadata(d.path[0])
|
||||||
|
|
||||||
idb['baselineLinesOfCode']
|
cqlc['baselineLinesOfCode']
|
||||||
idb['primaryLanguage']
|
cqlc['primaryLanguage']
|
||||||
idb['creationMetadata']['sha']
|
cqlc['creationMetadata']['sha']
|
||||||
idb['creationMetadata']['cliVersion']
|
cqlc['creationMetadata']['cliVersion']
|
||||||
idb['creationMetadata']['creationTime'].isoformat()
|
cqlc['creationMetadata']['creationTime'].isoformat()
|
||||||
idb['finalised']
|
cqlc['finalised']
|
||||||
|
|
||||||
for lang, lang_cont in ibl['languages'].items():
|
for lang, lang_cont in metac['languages'].items():
|
||||||
print(lang)
|
print(lang)
|
||||||
indent = " "
|
indent = " "
|
||||||
for prop, val in lang_cont.items():
|
for prop, val in lang_cont.items():
|
||||||
@@ -48,6 +51,28 @@ for lang, lang_cont in ibl['languages'].items():
|
|||||||
elif prop == 'displayName':
|
elif prop == 'displayName':
|
||||||
print("%sdisplayName %s" % (indent, val))
|
print("%sdisplayName %s" % (indent, val))
|
||||||
|
|
||||||
|
#** Automated for all entries
|
||||||
|
d = dbdf_1
|
||||||
|
joiners = []
|
||||||
|
for left_index in range(0, len(d)-1):
|
||||||
|
try:
|
||||||
|
cqlc, metac = extract_metadata(d.path[left_index])
|
||||||
|
except ExtractNotZipfile:
|
||||||
|
continue
|
||||||
|
except ExtractNoCQLDB:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
detail_df = metadata_details(left_index, cqlc, metac)
|
||||||
|
except DetailsMissing:
|
||||||
|
continue
|
||||||
|
joiners.append(detail_df)
|
||||||
|
joiners_df = pd.concat(joiners, axis=0)
|
||||||
|
full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer')
|
||||||
|
|
||||||
|
#** View the full dataframe with metadata
|
||||||
|
from pandasgui import show
|
||||||
|
os.environ['APPDATA'] = "needed-for-pandasgui"
|
||||||
|
show(full_df)
|
||||||
|
|
||||||
#
|
#
|
||||||
# Local Variables:
|
# Local Variables:
|
||||||
|
|||||||
@@ -30,6 +30,10 @@ def log_and_raise(message):
|
|||||||
logging.error(message)
|
logging.error(message)
|
||||||
raise Exception(message)
|
raise Exception(message)
|
||||||
|
|
||||||
|
def log_and_raise_e(message, exception):
|
||||||
|
logging.error(message)
|
||||||
|
raise exception(message)
|
||||||
|
|
||||||
def traverse_tree(root):
|
def traverse_tree(root):
|
||||||
root_path = Path(os.path.expanduser(root))
|
root_path = Path(os.path.expanduser(root))
|
||||||
if not root_path.exists() or not root_path.is_dir():
|
if not root_path.exists() or not root_path.is_dir():
|
||||||
@@ -83,16 +87,95 @@ def dbdf_from_tree():
|
|||||||
def extract_metadata(zipfile_path):
|
def extract_metadata(zipfile_path):
|
||||||
codeql_content = None
|
codeql_content = None
|
||||||
meta_content = None
|
meta_content = None
|
||||||
with zipfile.ZipFile(zipfile_path, 'r') as z:
|
# Files may not be zip files:
|
||||||
for file_info in z.infolist():
|
# {"message":"Repository was archived so is read-only.",
|
||||||
if file_info.filename == 'codeql_db/codeql-database.yml':
|
# "documentation_url":"https://docs.github.com/rest/code-scanning/code-scanning#get-a-codeql-database-for-a-repository"}
|
||||||
with z.open(file_info) as f:
|
#
|
||||||
codeql_content = yaml.safe_load(f)
|
try:
|
||||||
elif file_info.filename == 'codeql_db/baseline-info.json':
|
with zipfile.ZipFile(zipfile_path, 'r') as z:
|
||||||
with z.open(file_info) as f:
|
for file_info in z.infolist():
|
||||||
meta_content = json.load(f)
|
# Filenames seen
|
||||||
|
# java/codeql-database.yml
|
||||||
|
# codeql_db/codeql-database.yml
|
||||||
|
if file_info.filename.endswith('codeql-database.yml'):
|
||||||
|
with z.open(file_info) as f:
|
||||||
|
codeql_content = yaml.safe_load(f)
|
||||||
|
# And
|
||||||
|
# java/baseline-info.json
|
||||||
|
# codeql_db/baseline-info.json
|
||||||
|
elif file_info.filename.endswith('baseline-info.json'):
|
||||||
|
with z.open(file_info) as f:
|
||||||
|
meta_content = json.load(f)
|
||||||
|
except zipfile.BadZipFile:
|
||||||
|
log_and_raise_e(f"Not a zipfile: '{zipfile_path}'", ExtractNotZipfile)
|
||||||
|
# The baseline-info is only available in more recent CodeQL versions
|
||||||
|
if not meta_content:
|
||||||
|
meta_content = {'languages':
|
||||||
|
{'no-language': {'displayName': 'no-language',
|
||||||
|
'files': [],
|
||||||
|
'linesOfCode': -1,
|
||||||
|
'name': 'nolang'},
|
||||||
|
}}
|
||||||
|
|
||||||
|
if not codeql_content:
|
||||||
|
log_and_raise_e(f"Not a zipfile: '{zipfile_path}'", ExtractNoCQLDB)
|
||||||
return codeql_content, meta_content
|
return codeql_content, meta_content
|
||||||
|
|
||||||
|
class ExtractNotZipfile(Exception): pass
|
||||||
|
class ExtractNoCQLDB(Exception): pass
|
||||||
|
|
||||||
|
# metadata_details(codeql_content, meta_content)
|
||||||
|
#
|
||||||
|
# Extract the details from metadata that will be used in DB selection and return a
|
||||||
|
# dataframe with the information. Example, cropped to fit:
|
||||||
|
#
|
||||||
|
# full_df.T
|
||||||
|
# Out[535]:
|
||||||
|
# 0 1
|
||||||
|
# left_index 0 0
|
||||||
|
# baselineLinesOfCode 17990 17990
|
||||||
|
# primaryLanguage cpp cpp
|
||||||
|
# sha 288920efc079766f4 282c20efc079766f4
|
||||||
|
# cliVersion 2.17.0 2.17.0
|
||||||
|
# creationTime .325253+00:00 51.325253+00:00
|
||||||
|
# finalised True True
|
||||||
|
# db_lang cpp python
|
||||||
|
# db_lang_displayName C/C++ Python
|
||||||
|
# db_lang_file_count 102 27
|
||||||
|
# db_lang_linesOfCode 17990 5586
|
||||||
|
#
|
||||||
|
def metadata_details(left_index, codeql_content, meta_content):
|
||||||
|
cqlc, metac = codeql_content, meta_content
|
||||||
|
d = {'left_index': left_index,
|
||||||
|
'baselineLinesOfCode': cqlc['baselineLinesOfCode'],
|
||||||
|
'primaryLanguage': cqlc['primaryLanguage'],
|
||||||
|
'sha': cqlc['creationMetadata'].get('sha', 'abcde0123'),
|
||||||
|
'cliVersion': cqlc['creationMetadata']['cliVersion'],
|
||||||
|
'creationTime': cqlc['creationMetadata']['creationTime'],
|
||||||
|
'finalised': cqlc.get('finalised', pd.NA),
|
||||||
|
}
|
||||||
|
f = pd.DataFrame(d, index=[0])
|
||||||
|
joiners = []
|
||||||
|
if not ('languages' in metac):
|
||||||
|
log_and_raise_e("Missing 'languages' in metadata", DetailsMissing)
|
||||||
|
for lang, lang_cont in metac['languages'].items():
|
||||||
|
d1 = { 'left_index' : left_index,
|
||||||
|
'db_lang': lang }
|
||||||
|
for prop, val in lang_cont.items():
|
||||||
|
if prop == 'files':
|
||||||
|
d1['db_lang_file_count'] = len(val)
|
||||||
|
elif prop == 'linesOfCode':
|
||||||
|
d1['db_lang_linesOfCode'] = val
|
||||||
|
elif prop == 'displayName':
|
||||||
|
d1['db_lang_displayName'] = val
|
||||||
|
joiners.append(d1)
|
||||||
|
fj = pd.DataFrame(joiners)
|
||||||
|
full_df = pd.merge(f, fj, on='left_index', how='outer')
|
||||||
|
return full_df
|
||||||
|
|
||||||
|
class DetailsMissing(Exception): pass
|
||||||
|
|
||||||
|
|
||||||
# Local Variables:
|
# Local Variables:
|
||||||
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
|
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
|
||||||
# End:
|
# End:
|
||||||
|
|||||||
Reference in New Issue
Block a user