123 lines
3.6 KiB
Python
Executable File
123 lines
3.6 KiB
Python
Executable File
#!/usr/bin/env python
|
|
""" Read a table of CodeQL DB information and produce a table with unique entries
|
|
adding the Cumulative ID (CID) column.
|
|
|
|
To make this happen:
|
|
- Group entries by (owner,name,CID),
|
|
sort each group by creationTime,
|
|
and keep only the top (newest) element.
|
|
|
|
- Drop rows that don't have the
|
|
| cliVersion |
|
|
| creationTime |
|
|
| language |
|
|
| sha |
|
|
columns. There are very few (16 out of 6000 on recent tests) and their DBs
|
|
are questionable.
|
|
|
|
"""
|
|
import argparse
|
|
import logging
|
|
from argparse import Namespace
|
|
from typing import Any
|
|
|
|
from pandas import DataFrame, Series
|
|
|
|
#
|
|
#* Configure logger
|
|
#
|
|
# Take a handle on the root logger; both steps below act on it.
root_logger = logging.getLogger()
# Install the default handler: "<timestamp> <message>", INFO and above.
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
# basicConfig() does nothing when the root logger already has handlers, so the
# level is also set explicitly.  The original note says this overrides a level
# set by minio -- NOTE(review): minio is not imported in this script; confirm
# whether the explicit override is still needed.
root_logger.setLevel(logging.INFO)
|
|
|
|
#
|
|
#* Process command line
|
|
#
|
|
# Command line: a single positional <language>; the table itself arrives on
# stdin and the result leaves on stdout.  The description below is what
# `--help` prints, so it is kept in step with what the script actually does
# (the grouping key includes the CID, and incomplete rows are dropped).
parser = argparse.ArgumentParser(
    description=""" Read a table of CodeQL DB information from stdin,
    drop rows missing cliVersion, creationTime, language or sha,
    narrow to <language>,
    group entries by (owner,name,CID), sort each group by
    creationTime and keep only the top (newest) element.
    The resulting table is written to stdout as CSV.
    """)
parser.add_argument('language', type=str,
                    help='The language to be analyzed.')

# Parsed eagerly at import/run time; args.language is used for the row filter.
args: Namespace = parser.parse_args()
|
|
#
|
|
#* Collect the information and select subset
|
|
#
|
|
import pandas as pd
|
|
import sys
|
|
import qldbtools.utils as utils
|
|
|
|
# Load the complete table of CodeQL DB information (CSV) from stdin.
df2: DataFrame = pd.read_csv(sys.stdin)

#
#* Add single uniqueness field -- CID (Cumulative ID)
#
def _cid_for_row(row):
    """Return utils.cid_hash() of the row's identifying fields.

    The fields, in order, are cliVersion, creationTime, language and sha;
    they are passed to utils.cid_hash() as one tuple.
    """
    identifying_fields = (
        row['cliVersion'],
        row['creationTime'],
        row['language'],
        row['sha'],
    )
    return utils.cid_hash(identifying_fields)


# axis=1 hands the rows to _cid_for_row one at a time; the results become
# the new 'CID' column.
df2['CID'] = df2.apply(_cid_for_row, axis=1)
|
|
|
|
#
|
|
#* Re-order the dataframe columns by importance
|
|
# - Much of the data
|
|
# 1. Is only conditionally present
|
|
# 2. Is extra info, not for the DB proper
|
|
# 3. May have various names
|
|
#
|
|
# - The essential columns are
|
|
# | owner |
|
|
# | name |
|
|
# | language |
|
|
# | size |
|
|
# | cliVersion |
|
|
# | creationTime |
|
|
# | sha |
|
|
# | baselineLinesOfCode |
|
|
# | path |
|
|
#
|
|
# - The rest are useful; put them last
|
|
# | db_lang |
|
|
# | db_lang_displayName |
|
|
# | db_lang_file_count |
|
|
# | db_lang_linesOfCode |
|
|
# | left_index |
|
|
# | ctime |
|
|
# | primaryLanguage |
|
|
# | finalised |
|
|
|
|
# Put the identifying columns first and the auxiliary ones last.  reindex()
# keeps exactly the listed columns: a listed column the input lacks is created
# filled with NaN, and any input column not listed here is dropped.
df3: DataFrame = df2.reindex(columns=[
    'owner', 'name', 'cliVersion', 'creationTime',
    'language', 'sha', 'CID',
    'baselineLinesOfCode', 'path', 'db_lang',
    'db_lang_displayName', 'db_lang_file_count',
    'db_lang_linesOfCode', 'ctime',
    'primaryLanguage', 'finalised', 'left_index',
    'size',
])

# Identify rows missing specific entries
# (`rows` is a boolean mask that is True for the rows to be discarded.)
rows = (df3['cliVersion'].isna() |
        df3['creationTime'].isna() |
        df3['language'].isna() |
        df3['sha'].isna())
df4: DataFrame = df3[~rows]

# Limit to one language
df5 = df4[df4['language'] == args.language]

# Sort and group
# The documented contract is "keep the newest element of each
# (owner, name, CID) group".  groupby() preserves row order inside a group and
# first() picks values in that order, so creationTime has to be sorted
# newest-first; the group keys themselves stay ascending.  An all-ascending
# sort followed by first() would keep the *oldest* entry instead.
# NOTE(review): CID is computed from creationTime (among other fields), so
# rows sharing a CID presumably share creationTime as well -- depends on
# utils.cid_hash; confirm.
df_sorted: DataFrame = df5.sort_values(
    by=['owner', 'name', 'CID', 'creationTime'],
    ascending=[True, True, True, False])
# NOTE(review): GroupBy.first() returns the first *non-null* value of every
# column, so a result row can combine values from several rows of a group
# when the leading row has gaps.  Confirm that this gap-filling is wanted;
# it is left unchanged here.
df_unique: DataFrame = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index()

# Write output
# CSV on stdout, without the positional index column.
df_unique.to_csv(sys.stdout, index=False)
|
|
|
|
# Local Variables:
|
|
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
|
|
# End:
|