The current API (<2024-07-26 Fri>) identifies databases only by (owner,name). This is
insufficient for distinguishing CodeQL databases.
Other differences must be considered; this patch combines the fields
| cliVersion |
| creationTime |
| language |
| sha |
into one called CID. The CID field is a hash of these others and therefore can be
changed in the future without affecting workflows or the server.
The CID is combined with the owner/name to form one
identifier. This requires no changes to the server or client -- the DB
selection's interface is separate from VS Code and gh-mrva in any case.
To test this, this version imports multiple versions of the same owner/repo pairs from multiple directories. In this case, from
~/work-gh/mrva/mrva-open-source-download/repos
and
~/work-gh/mrva/mrva-open-source-download/repos-2024-04-29/
The unique database count increases from 3000 to 5360 -- see README.md,
./bin/mc-db-view-info < db-info-3.csv &
Other code modifications:
- Push (owner,repo,cid) names to minio
- Generate databases.json for use in the VS Code extension
- Generate list-databases.json for use by gh-mrva client
94 lines
3.1 KiB
Python
Executable File
94 lines
3.1 KiB
Python
Executable File
#!/usr/bin/env python
|
|
""" Read an initial table of CodeQL DB information, produced by
|
|
mc-db-initial-info, and collect more detailed information from the database
|
|
files. Write out an extended table in CSV format.
|
|
"""
|
|
import qldbtools.utils as utils
|
|
import argparse
|
|
import logging
|
|
import pandas as pd
|
|
import sys
|
|
|
|
#
#* Configure logger
#
# Every log record is rendered as "<timestamp> <message>".
_log_format = '%(asctime)s %(message)s'
logging.basicConfig(format=_log_format)

#
#* Process command line
#
# No options are declared here; parsing still provides -h/--help and rejects
# unexpected arguments.  The table itself is read from stdin further down.
_description = """Read an initial table of CodeQL DB information, produced by
mc-db-initial-info, and collect more detailed information from the database
files. Write out an extended table in CSV format. """
parser = argparse.ArgumentParser(description=_description)
args = parser.parse_args()
|
|
|
|
#
#* Collect the information
#
# The initial table arrives on stdin as CSV.  Its `path` column is what gets
# handed to utils.extract_metadata for each row.
d = pd.read_csv(sys.stdin)

# One detail frame per row whose metadata could be extracted.
joiners = []

# Visit *every* row (the previous `range(0, len(d)-1)` silently skipped the
# last one).  Iterating the column with .items() yields the table's own index
# label, which is the key the merge below uses to line details up again.
for left_index, db_path in d["path"].items():
    try:
        cqlc, metac = utils.extract_metadata(db_path)
    except (utils.ExtractNotZipfile, utils.ExtractNoCQLDB):
        # Nothing to extract from this entry; leave the row without details.
        continue
    try:
        detail_df = utils.metadata_details(left_index, cqlc, metac)
    except utils.DetailsMissing:
        # Metadata was found but the details could not be assembled.
        continue
    joiners.append(detail_df)

# pd.concat raises an opaque ValueError on an empty list; report the real
# problem instead.
if not joiners:
    logging.error("no database details could be collected from the input table")
    sys.exit(1)

joiners_df = pd.concat(joiners, axis=0)

# Outer join: rows of `d` without collected details are kept (their detail
# columns are empty), matched on d's index against the `left_index` column.
full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer')
|
|
|
|
#** Add single uniqueness field -- CID (Cumulative ID)
# Columns folded into the CID, listed in the order they are passed to
# utils.cid_hash.  Keep this order: it is the order of the tuple being hashed
# (presumably significant to the resulting hash -- confirm in cid_hash).
_cid_fields = ('creationTime', 'sha', 'cliVersion', 'language')


def _row_cid(row):
    """Return utils.cid_hash of the tuple of CID field values of one row."""
    return utils.cid_hash(tuple(row[field] for field in _cid_fields))


full_df['CID'] = full_df.apply(_row_cid, axis=1)
|
|
|
|
#** Re-order the dataframe columns by importance
# - Much of the collected data
#   1. Is only conditionally present
#   2. Is extra info, not for the DB proper
#   3. May have various names
#
# - reindex() emits exactly the columns listed below, in this order: a listed
#   column that is absent from full_df is created empty, and any column not
#   listed is dropped.
#
# - Columns that describe the database itself come first.
_leading_columns = [
    'owner',
    'name',
    'cliVersion',
    'creationTime',
    'language',
    'sha',
    'CID',
    'baselineLinesOfCode',
    'path',
]

# - The rest are useful; they go last.  Note that `size` is emitted as the
#   very last column.
_trailing_columns = [
    'db_lang',
    'db_lang_displayName',
    'db_lang_file_count',
    'db_lang_linesOfCode',
    'ctime',
    'primaryLanguage',
    'finalised',
    'left_index',
    'size',
]

final_df = full_df.reindex(columns=_leading_columns + _trailing_columns)

# Write the extended table to stdout as CSV, without the dataframe index.
final_df.to_csv(sys.stdout, index=False)
|
|
|
|
# Local Variables:
|
|
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
|
|
# End:
|