Enforce CID uniqueness and save raw refined info immediately

Previously, the refined info was collected and the CID computed before saving.
This was a major development time sink, so the CID is now computed in the
following step (bin/mc-db-unique).

The columns previously chosen for the CID are not enough.  If these columns are
empty for any reason, the CID repeats.  Just including the owner/name won't help,
because those are duplicates.

Some possibilities considered and rejected:
1. Could use a random number for missing columns.  But this makes
   the CID nondeterministic.
2. Switch to the file system ctime?  Not unique across owner/repo pairs,
   but unique within one.  Also, this could be changed externally and cause
   *very* subtle bugs.
3. Use the file system path?  It has to be unique at ingestion time, but
   repo collections can move.

Instead, this patch
4. Drops rows that don't have the
   | cliVersion   |
   | creationTime |
   | language     |
   | sha          |
   columns.  There are very few (16 out of 6000) and their DBs are
   questionable.
This commit is contained in:
Michael Hohn
2024-08-01 11:09:04 -07:00
committed by =Michael Hohn
parent 06dcf50728
commit b7b4839fe0
4 changed files with 117 additions and 59 deletions

View File

@@ -84,4 +84,10 @@ import qldbtools as ql
./bin/mc-db-generate-selection -n 23 vscode-selection.json gh-mrva-selection.json < db-info-3.csv ./bin/mc-db-generate-selection -n 23 vscode-selection.json gh-mrva-selection.json < db-info-3.csv
## Notes
The preview-data plugin for VS Code has a bug; it displays `0` instead of
`0e3379` for the following. There are other entries with a similar malfunction.
CleverRaven,Cataclysm-DDA,0e3379,2.17.0,2024-05-08 12:13:10.038007+00:00,cpp,5ca7f4e59c2d7b0a93fb801a31138477f7b4a761,578098.0,/Users/hohn/work-gh/mrva/mrva-open-source-download/repos-2024-04-29/CleverRaven/Cataclysm-DDA/code-scanning/codeql/databases/cpp/db.zip,cpp,C/C++,1228.0,578098.0,2024-05-13T12:14:54.650648,cpp,True,4245,563435469
CleverRaven,Cataclysm-DDA,3231f7,2.18.0,2024-07-18 11:13:01.673231+00:00,cpp,db3435138781937e9e0e999abbaa53f1d3afb5b7,579532.0,/Users/hohn/work-gh/mrva/mrva-open-source-download/repos/CleverRaven/Cataclysm-DDA/code-scanning/codeql/databases/cpp/db.zip,cpp,C/C++,1239.0,579532.0,2024-07-24T02:33:23.900885,cpp,True,1245,573213726

View File

@@ -25,7 +25,7 @@ args = parser.parse_args()
# #
#* Collect the information #* Collect the information
# # This step is time-intensive so we save the results right after.
d = pd.read_csv(sys.stdin) d = pd.read_csv(sys.stdin)
joiners = [] joiners = []
for left_index in range(0, len(d)-1): for left_index in range(0, len(d)-1):
@@ -43,51 +43,10 @@ for left_index in range(0, len(d)-1):
joiners_df = pd.concat(joiners, axis=0) joiners_df = pd.concat(joiners, axis=0)
full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer') full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer')
#** Add single uniqueness field -- CID (Cumulative ID) #
full_df['CID'] = full_df.apply(lambda row: #* Save results
utils.cid_hash(( #
row['cliVersion'], full_df.to_csv(sys.stdout, index=False)
row['creationTime'],
row['language'],
row['sha'],
)), axis=1)
#** Re-order the dataframe columns by importance
# - Much of the data
# 1. Is only conditionally present
# 2. Is extra info, not for the DB proper
# 3. May have various names
#
# - The essential columns are
# | owner |
# | name |
# | language |
# | size |
# | cliVersion |
# | creationTime |
# | sha |
# | baselineLinesOfCode |
# | path |
#
# - The rest are useful; put them last
# | db_lang |
# | db_lang_displayName |
# | db_lang_file_count |
# | db_lang_linesOfCode |
# | left_index |
# | ctime |
# | primaryLanguage |
# | finalised |
final_df = full_df.reindex( columns=['owner', 'name', 'cliVersion',
'creationTime', 'language', 'sha','CID',
'baselineLinesOfCode', 'path', 'db_lang',
'db_lang_displayName', 'db_lang_file_count',
'db_lang_linesOfCode', 'ctime',
'primaryLanguage', 'finalised', 'left_index',
'size'])
final_df.to_csv(sys.stdout, index=False)
# Local Variables: # Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/" # python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"

View File

@@ -1,8 +1,20 @@
#!/usr/bin/env python #!/usr/bin/env python
""" Read a table of CodeQL DB information, """ Read a table of CodeQL DB information and produce a table with unique entries
group entries by (owner,name,CID), adding the Cumulative ID (CID) column.
sort each group by creationTime,
and keep only the top (newest) element. To make this happen:
- Group entries by (owner,name,CID),
sort each group by creationTime,
and keep only the top (newest) element.
- Drop rows that don't have the
| cliVersion |
| creationTime |
| language |
| sha |
columns. There are very few (16 out of 6000 on recent tests) and their DBs
are questionable.
""" """
import argparse import argparse
import logging import logging
@@ -30,15 +42,71 @@ args = parser.parse_args()
# #
import pandas as pd import pandas as pd
import sys import sys
import qldbtools.utils as utils
df0 = pd.read_csv(sys.stdin) df2 = pd.read_csv(sys.stdin)
df_sorted = df0.sort_values(by=['owner', 'name', 'CID', 'creationTime']) #
#* Add single uniqueness field -- CID (Cumulative ID)
#
df2['CID'] = df2.apply(lambda row:
utils.cid_hash((
row['cliVersion'],
row['creationTime'],
row['language'],
row['sha'],
)), axis=1)
#
#* Re-order the dataframe columns by importance
# - Much of the data
# 1. Is only conditionally present
# 2. Is extra info, not for the DB proper
# 3. May have various names
#
# - The essential columns are
# | owner |
# | name |
# | language |
# | size |
# | cliVersion |
# | creationTime |
# | sha |
# | baselineLinesOfCode |
# | path |
#
# - The rest are useful; put them last
# | db_lang |
# | db_lang_displayName |
# | db_lang_file_count |
# | db_lang_linesOfCode |
# | left_index |
# | ctime |
# | primaryLanguage |
# | finalised |
df3 = df2.reindex( columns=['owner', 'name', 'cliVersion', 'creationTime',
'language', 'sha','CID',
'baselineLinesOfCode', 'path', 'db_lang',
'db_lang_displayName', 'db_lang_file_count',
'db_lang_linesOfCode', 'ctime',
'primaryLanguage', 'finalised', 'left_index',
'size'])
# Identify rows missing specific entries
rows = ( df3['cliVersion'].isna() |
df3['creationTime'].isna() |
df3['language'].isna() |
df3['sha'].isna() )
df4 = df3[~rows]
# Sort and group
df_sorted = df4.sort_values(by=['owner', 'name', 'CID', 'creationTime'])
df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index() df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index()
# Write output
df_unique.to_csv(sys.stdout, index=False) df_unique.to_csv(sys.stdout, index=False)
# Local Variables: # Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/" # python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End: # End:

View File

@@ -1,13 +1,38 @@
# Experimental work with utils.py, to be merged into it. # Experimental work for ../bin/mc-db-unique, to be merged into it.
from utils import * import qldbtools.utils as utils
from pprint import pprint from pprint import pprint
import pandas as pd
# cd ../
#* Reload CSV file to continue work #* Reload CSV file to continue work
df2 = pd.read_csv('db-info-2.csv') df2 = df_refined = pd.read_csv('db-info-2.csv')
# Identify rows missing specific entries
rows = ( df2['cliVersion'].isna() |
df2['creationTime'].isna() |
df2['language'].isna() |
df2['sha'].isna() )
df2[rows]
df3 = df2[~rows]
df3
#* post-save work
df4 = pd.read_csv('db-info-3.csv')
# Sort and group
df_sorted = df4.sort_values(by=['owner', 'name', 'CID', 'creationTime'])
df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index()
# Find duplicates
df_dups = df_unique[df_unique['CID'].duplicated(keep=False)]
len(df_dups)
df_dups['CID']
# Set display options
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 140)
df_sorted = df2.sort_values(by=['owner', 'name', 'creationTime'])
df_unique = df_sorted.groupby(['owner', 'name']).first().reset_index()
# #
# Local Variables: # Local Variables: