mrvacommander/client/qldbtools/bin/mc-db-unique

#!/usr/bin/env python
""" Read a table of CodeQL DB information and produce a table with unique entries
    adding the Cumulative ID (CID) column.

    To make this happen:
    - Group entries by (owner,name,CID),
      sort each group by creationTime,
      and keep only the top (newest) element.

    - Drop rows that don't have the
          | cliVersion   |
          | creationTime |
          | language     |
          | sha          |
      columns.  There are very few (16 out of 6000 on recent tests) and their DBs
      are questionable.

"""
import argparse
import logging
from argparse import Namespace
from typing import Any

from pandas import DataFrame, Series

#
#* Configure logger
#
_LOG_LEVEL = logging.INFO
_LOG_FORMAT = '%(asctime)s %(message)s'

# basicConfig() leaves a root logger that already has handlers untouched, so
# the level is additionally forced on the root logger itself.  This overrides
# a level that was set earlier by another library (minio, per the original
# note).
logging.basicConfig(format=_LOG_FORMAT, level=_LOG_LEVEL)
root_logger = logging.getLogger()
root_logger.setLevel(_LOG_LEVEL)

#
#* Process command line
#
# The description is the text printed by --help, so it has to describe what
# the script really does: the table is read from stdin, rows with missing
# values are dropped, uniqueness is decided per (owner,name,CID), and the
# result goes to stdout.
parser = argparse.ArgumentParser(
    description=""" Read a table of CodeQL DB information in CSV form from stdin,
    narrow to <language>,
    drop rows lacking cliVersion, creationTime, language or sha,
    group entries by (owner,name,CID),  sort each group by
    creationTime and keep a single entry per group.
    The result is written as CSV to stdout.
    """)
# Single required positional argument; compared verbatim against the
# 'language' column further down.
parser.add_argument('language', type=str,
                    help='The language to be analyzed.')

# Parsed at module level: this runs as soon as the script starts.
args: Namespace = parser.parse_args()
#
#* Collect the information and select subset
#
import pandas as pd
import sys
import qldbtools.utils as utils

# The input table arrives as CSV on stdin.
df2: DataFrame = pd.read_csv(sys.stdin)

#
#* Add single uniqueness field -- CID (Cumulative ID)
#
# The CID of a row is utils.cid_hash() applied to the tuple
# (cliVersion, creationTime, language, sha) of that row.
if df2.empty:
    # Header-only input: there is no row to hash.  Create the (empty) column
    # directly instead of depending on what DataFrame.apply() hands back for
    # a frame without rows, so an empty table flows through the rest of the
    # script and yields a header-only result.
    df2['CID'] = Series(dtype=object)
else:
    df2['CID'] = df2.apply(lambda row:
                           utils.cid_hash((
                               row['cliVersion'],
                               row['creationTime'],
                               row['language'],
                               row['sha'],
                           )), axis=1)

#
#* Re-order the dataframe columns by importance
# - Much of the data
#   1. Is only conditionally present
#   2. Is extra info, not for the DB proper
#   3. May have various names
#
# - The essential columns are
#     | owner               |
#     | name                |
#     | language            |
#     | size                |
#     | cliVersion          |
#     | creationTime        |
#     | sha                 |
#     | baselineLinesOfCode |
#     | path                |
#
# - The rest are useful; put them last
#     | db_lang             |
#     | db_lang_displayName |
#     | db_lang_file_count  |
#     | db_lang_linesOfCode |
#     | left_index          |
#     | ctime               |
#     | primaryLanguage     |
#     | finalised           |

# reindex() emits exactly these columns, in this order: a listed column that
# is absent from the input is created and filled with NaN, and an input
# column that is not listed is dropped.
_column_order = [
    'owner', 'name', 'cliVersion', 'creationTime',
    'language', 'sha', 'CID',
    'baselineLinesOfCode', 'path', 'db_lang',
    'db_lang_displayName', 'db_lang_file_count',
    'db_lang_linesOfCode', 'ctime',
    'primaryLanguage', 'finalised', 'left_index',
    'size',
]
df3: DataFrame = df2.reindex(columns=_column_order)

# Drop rows that lack any of the columns the CID is derived from
required_columns = ['cliVersion', 'creationTime', 'language', 'sha']
df4: DataFrame = df3.dropna(subset=required_columns)

# Limit to one language
df5: DataFrame = df4[df4['language'] == args.language]

# Sort so that, inside every (owner, name, CID) group, the newest creationTime
# comes first; the group keys themselves stay in ascending order.
group_keys = ['owner', 'name', 'CID']
df_sorted: DataFrame = df5.sort_values(
    by=group_keys + ['creationTime'],
    ascending=[True, True, True, False])

# Keep the first *whole row* of every group.  GroupBy.first() is avoided on
# purpose: it picks the first non-null value of each column independently and
# can therefore stitch one output row together from several input rows.
# Rows with a missing group key are discarded, which is what groupby() does
# by default.
df_unique: DataFrame = (df_sorted
                        .dropna(subset=group_keys)
                        .drop_duplicates(subset=group_keys, keep='first')
                        .reset_index(drop=True))

# Output layout: the group keys lead, followed by the remaining columns in
# their existing order (the layout groupby(...).first().reset_index() gives).
remaining_columns = [col for col in df_unique.columns if col not in group_keys]
df_unique = df_unique[group_keys + remaining_columns]

# Write output
df_unique.to_csv(sys.stdout, index=False)

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End: