Previously, the refined info was collected and the CID computed before
saving. This was a major development-time sink, so the CID is now computed in
the following step (bin/mc-db-unique).

The columns previously chosen for the CID are not enough: if those columns
are empty for any reason, the CID repeats. Just including owner/name won't
help, because those are the duplicated values. Some possibilities considered
and rejected:

1. Use a random number for missing columns. But this makes the CID
   nondeterministic.
2. Switch to the file system ctime? It is not unique across owner/repo
   pairs, only within one. It can also be changed externally and cause
   *very* subtle bugs.
3. Use the file system path? It has to be unique at ingestion time, but repo
   collections can move.

Instead, this patch
4. Drops rows that are missing any of the cliVersion, creationTime,
   language, or sha values. There are very few (16 out of 6000) and their
   DBs are questionable.
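As a point of reference, a minimal sketch of a deterministic cid_hash over
these four columns (an assumption; the real implementation lives in
qldbtools.utils and is not shown here, and the sample values are
illustrative):

    import hashlib

    def cid_hash(fields):
        # Deterministic by construction: the same (cliVersion, creationTime,
        # language, sha) tuple always maps to the same CID, unlike the
        # rejected random-fallback approach.
        joined = '\x1f'.join(str(f) for f in fields)
        return hashlib.sha256(joined.encode('utf-8')).hexdigest()

    cid_hash(('2.17.0', '2024-05-01T12:00:00Z', 'cpp', 'deadbeef'))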
#!/usr/bin/env python
""" Read a table of CodeQL DB information and produce a table with unique entries
adding the Cumulative ID (CID) column.

To make this happen:
- Group entries by (owner, name, CID),
  sort each group by creationTime,
  and keep only the top (newest) element.

- Drop rows that are missing any of the
  | cliVersion   |
  | creationTime |
  | language     |
  | sha          |
  columns. There are very few (16 out of 6000 on recent tests) and their DBs
  are questionable.
"""
import argparse
import logging

#
#* Configure logger
#
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
# Overwrite log level set by minio
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

#
#* Process command line
#
parser = argparse.ArgumentParser(
    description=""" Read a table of CodeQL DB information,
    group entries by (owner, name, CID), sort each group by
    creationTime and keep only the top (newest) element.
    """)

args = parser.parse_args()

#
#* Collect the information and select subset
#
import pandas as pd
import sys
import qldbtools.utils as utils

df2 = pd.read_csv(sys.stdin)

#
#* Add single uniqueness field -- CID (Cumulative ID)
#
df2['CID'] = df2.apply(lambda row:
                       utils.cid_hash((
                           row['cliVersion'],
                           row['creationTime'],
                           row['language'],
                           row['sha'],
                       )), axis=1)
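# The CID is a deterministic function of these four columns: recomputing it
# over the same rows always yields the same value (rows missing any of the
# fields are dropped below rather than filled with random values).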

#
#* Re-order the dataframe columns by importance
# - Much of the data
#   1. Is only conditionally present
#   2. Is extra info, not for the DB proper
#   3. May have various names
#
# - The essential columns are
#   | owner               |
#   | name                |
#   | language            |
#   | size                |
#   | cliVersion          |
#   | creationTime        |
#   | sha                 |
#   | baselineLinesOfCode |
#   | path                |
#
# - The rest are useful; put them last
#   | db_lang             |
#   | db_lang_displayName |
#   | db_lang_file_count  |
#   | db_lang_linesOfCode |
#   | left_index          |
#   | ctime               |
#   | primaryLanguage     |
#   | finalised           |

df3 = df2.reindex(columns=['owner', 'name', 'cliVersion', 'creationTime',
                           'language', 'sha', 'CID',
                           'baselineLinesOfCode', 'path', 'db_lang',
                           'db_lang_displayName', 'db_lang_file_count',
                           'db_lang_linesOfCode', 'ctime',
                           'primaryLanguage', 'finalised', 'left_index',
                           'size'])
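# Note: reindex keeps only the listed columns, in this order; any listed
# column absent from the input is created and filled with NaN instead of
# raising an error.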

# Identify rows missing specific entries
rows = (df3['cliVersion'].isna() |
        df3['creationTime'].isna() |
        df3['language'].isna() |
        df3['sha'].isna())
df4 = df3[~rows]
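# Report how many rows were dropped; the module docstring notes roughly 16 of
# 6000 in recent tests.
logging.info('Dropped %d of %d rows with missing essential columns',
             rows.sum(), len(df3))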

# Sort and group, keeping the newest entry per (owner, name, CID): sort
# creationTime in descending order so that .first() picks the newest element.
df_sorted = df4.sort_values(by=['owner', 'name', 'CID', 'creationTime'],
                            ascending=[True, True, True, False])
df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index()

# Write output
df_unique.to_csv(sys.stdout, index=False)
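# Usage sketch (assumed; file names are illustrative): the script is a
# stdin-to-stdout filter, e.g.
#   bin/mc-db-unique < db-info.csv > db-info-unique.csv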

# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End: