Files
mrvacommander/client/qldbtools/bin/mc-db-unique
2024-08-09 08:36:48 -07:00

119 lines
3.5 KiB
Python
Executable File

#!/usr/bin/env python
""" Read a table of CodeQL DB information and produce a table with unique entries
adding the Cumulative ID (CID) column.

The table is read as CSV from stdin; the result is written as CSV to stdout.

To make this happen:
- Group entries by (owner,name,CID),
  sort each group by creationTime,
  and keep only the top (newest) element.

- Drop rows that don't have the
  | cliVersion   |
  | creationTime |
  | language     |
  | sha          |
  columns.  There are very few (16 out of 6000 on recent tests) and their DBs
  are questionable.
"""
import argparse
import logging
import sys

#
#* Configure logger
#
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
# Overwrite log level set by minio
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

#
#* Column sets used by the pipeline
#
# Columns that are hashed into the CID.  Rows missing any of them are dropped.
CID_COLUMNS = ['cliVersion', 'creationTime', 'language', 'sha']

# Key identifying one unique DB entry in the output.
GROUP_COLUMNS = ['owner', 'name', 'CID']

# Output column selection and order.
# - Much of the data
#   1. Is only conditionally present
#   2. Is extra info, not for the DB proper
#   3. May have various names
#
# - The essential columns are
#   | owner               |
#   | name                |
#   | language            |
#   | size                |
#   | cliVersion          |
#   | creationTime        |
#   | sha                 |
#   | baselineLinesOfCode |
#   | path                |
#
# - The rest are useful; put them last
#   | db_lang             |
#   | db_lang_displayName |
#   | db_lang_file_count  |
#   | db_lang_linesOfCode |
#   | left_index          |
#   | ctime               |
#   | primaryLanguage     |
#   | finalised           |
OUTPUT_COLUMNS = ['owner', 'name', 'cliVersion', 'creationTime',
                  'language', 'sha', 'CID',
                  'baselineLinesOfCode', 'path', 'db_lang',
                  'db_lang_displayName', 'db_lang_file_count',
                  'db_lang_linesOfCode', 'ctime',
                  'primaryLanguage', 'finalised', 'left_index',
                  'size']


#
#* Process command line
#
def parse_args(argv=None):
    """Parse the command line and return the argparse namespace.

    argv -- list of argument strings; None means use sys.argv[1:].

    The namespace has a single attribute, `language`.
    """
    parser = argparse.ArgumentParser(
        description=""" Read a table of CodeQL DB information,
        narrow to <language>,
        group entries by (owner,name,CID), sort each group by
        creationTime and keep only the top (newest) element.
        """)
    parser.add_argument('language', type=str,
                        help='The language to be analyzed.')
    return parser.parse_args(argv)


#
#* Add single uniqueness field -- CID (Cumulative ID)
#
def add_cid(df, cid_hash=None):
    """Add a 'CID' column to `df` in place and return `df`.

    The CID of a row is cid_hash() applied to the tuple of that row's
    CID_COLUMNS values.

    cid_hash -- callable taking that tuple; None means use
                qldbtools.utils.cid_hash.
    """
    if cid_hash is None:
        # Project-local import, only needed when no hash function is supplied.
        import qldbtools.utils as utils
        cid_hash = utils.cid_hash
    df['CID'] = df.apply(
        lambda row: cid_hash(tuple(row[col] for col in CID_COLUMNS)),
        axis=1)
    return df


#
#* Re-order the dataframe columns by importance
#
def order_columns(df):
    """Return `df` restricted to OUTPUT_COLUMNS, in that order.

    Columns absent from `df` appear filled with NaN; columns not listed in
    OUTPUT_COLUMNS are dropped (that is what DataFrame.reindex does).
    """
    return df.reindex(columns=OUTPUT_COLUMNS)


def drop_incomplete_rows(df):
    """Return the rows of `df` that have a value in every CID_COLUMNS column."""
    incomplete = df[CID_COLUMNS].isna().any(axis=1)
    return df[~incomplete]


def select_language(df, language):
    """Return the rows of `df` whose 'language' column equals `language`."""
    # XX: Limit to one language
    return df[df['language'] == language]


def keep_newest_per_group(df):
    """Return one row per (owner,name,CID) group, taken from the newest entry.

    The result has the GROUP_COLUMNS first, followed by the remaining columns
    of `df`, with groups in sorted key order.
    """
    # creationTime is sorted descending inside each group so that first()
    # yields the newest entry, as the module docstring promises.  (The
    # previous ascending sort made first() return the oldest one.)
    # Rows sharing a CID presumably share creationTime as well, since it is
    # one of the hashed fields; the ordering matters whenever they do not.
    newest_first = df.sort_values(
        by=GROUP_COLUMNS + ['creationTime'],
        ascending=[True] * len(GROUP_COLUMNS) + [False])
    # NOTE: GroupBy.first() takes the first non-null value of each column
    # within the group, so a gap in the newest row is filled from the next one.
    return newest_first.groupby(GROUP_COLUMNS).first().reset_index()


def main(argv=None):
    """Run the stdin -> stdout CSV pipeline for the language given in `argv`."""
    args = parse_args(argv)

    #
    #* Collect the information and select subset
    #
    # Imported after argument parsing, so usage errors and --help are reported
    # without loading pandas.
    import pandas as pd
    table = pd.read_csv(sys.stdin)

    table = add_cid(table)
    table = order_columns(table)
    table = drop_incomplete_rows(table)
    table = select_language(table, args.language)

    # Sort and group
    unique = keep_newest_per_group(table)

    # Write output
    unique.to_csv(sys.stdout, index=False)


if __name__ == "__main__":
    main()
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End: