From b7b4839fe0760287b80b8e2887c29d736c5cae33 Mon Sep 17 00:00:00 2001
From: Michael Hohn
Date: Thu, 1 Aug 2024 11:09:04 -0700
Subject: [PATCH] Enforce CID uniqueness and save raw refined info immediately

Previously, the refined info was collected and the CID computed before
saving.  This was a major development time sink, so the CID is now computed
in the following step (bin/mc-db-unique) and the raw refined info is saved
as soon as it is collected.

The columns previously chosen for the CID are not enough.  If these columns
are empty for any reason, the CID repeats.  Just including the owner/name
won't help, because those are duplicates.

Some possibilities considered and rejected:

1. Could use a random number for missing columns.  But this makes the CID
   nondeterministic.

2. Switch to the file system ctime?  Not unique across owner/repo pairs, but
   unique within one.  Also, this could be changed externally and cause
   *very* subtle bugs.

3. Use the file system path?  It has to be unique at ingestion time, but
   repo collections can move.

Instead, this patch

4. Drops rows that don't have the
       | cliVersion   |
       | creationTime |
       | language     |
       | sha          |
   columns.  There are very few (16 out of 6000) and their DBs are
   questionable.
---
 client/qldbtools/README.md                  |  8 +-
 client/qldbtools/bin/mc-db-refine-info      | 51 ++----------
 client/qldbtools/bin/mc-db-unique           | 82 +++++++++++++++++--
 .../qldbtools/qldbtools/session-4-unique.py | 35 ++++++--
 4 files changed, 117 insertions(+), 59 deletions(-)

diff --git a/client/qldbtools/README.md b/client/qldbtools/README.md
index 6a7f70f..c923671 100644
--- a/client/qldbtools/README.md
+++ b/client/qldbtools/README.md
@@ -84,4 +84,10 @@ import qldbtools as ql
     ./bin/mc-db-generate-selection -n 23 vscode-selection.json gh-mrva-selection.json < db-info-3.csv
 
 
-
+## Notes
+
+  The preview-data plugin for VS Code has a bug; it displays `0` instead of
+  `0e3379` for the following.  There are other entries with a similar malfunction.
+
+  CleverRaven,Cataclysm-DDA,0e3379,2.17.0,2024-05-08 12:13:10.038007+00:00,cpp,5ca7f4e59c2d7b0a93fb801a31138477f7b4a761,578098.0,/Users/hohn/work-gh/mrva/mrva-open-source-download/repos-2024-04-29/CleverRaven/Cataclysm-DDA/code-scanning/codeql/databases/cpp/db.zip,cpp,C/C++,1228.0,578098.0,2024-05-13T12:14:54.650648,cpp,True,4245,563435469
+  CleverRaven,Cataclysm-DDA,3231f7,2.18.0,2024-07-18 11:13:01.673231+00:00,cpp,db3435138781937e9e0e999abbaa53f1d3afb5b7,579532.0,/Users/hohn/work-gh/mrva/mrva-open-source-download/repos/CleverRaven/Cataclysm-DDA/code-scanning/codeql/databases/cpp/db.zip,cpp,C/C++,1239.0,579532.0,2024-07-24T02:33:23.900885,cpp,True,1245,573213726
diff --git a/client/qldbtools/bin/mc-db-refine-info b/client/qldbtools/bin/mc-db-refine-info
index d5222fd..0202356 100755
--- a/client/qldbtools/bin/mc-db-refine-info
+++ b/client/qldbtools/bin/mc-db-refine-info
@@ -25,7 +25,7 @@ args = parser.parse_args()
 
 #
 #* Collect the information
-#
+# This step is time-intensive, so we save the results right after.
 d = pd.read_csv(sys.stdin)
 joiners = []
 for left_index in range(0, len(d)-1):
@@ -43,51 +43,10 @@ for left_index in range(0, len(d)-1):
 joiners_df = pd.concat(joiners, axis=0)
 full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index',
                    how='outer')
-#** Add single uniqueness field -- CID (Cumulative ID)
-full_df['CID'] = full_df.apply(lambda row:
-                               utils.cid_hash((
-                                   row['cliVersion'],
-                                   row['creationTime'],
-                                   row['language'],
-                                   row['sha'],
-                               )), axis=1)
-
-#** Re-order the dataframe columns by importance
-#  - Much of the data
-#    1. Is only conditionally present
-#    2. Is extra info, not for the DB proper
-#    3. May have various names
-#
-#  - The essential columns are
-#    | owner |
-#    | name |
-#    | language |
-#    | size |
-#    | cliVersion |
-#    | creationTime |
-#    | sha |
-#    | baselineLinesOfCode |
-#    | path |
-#
-#  - The rest are useful; put them last
-#    | db_lang |
-#    | db_lang_displayName |
-#    | db_lang_file_count |
-#    | db_lang_linesOfCode |
-#    | left_index |
-#    | ctime |
-#    | primaryLanguage |
-#    | finalised |
-
-final_df = full_df.reindex( columns=['owner', 'name', 'cliVersion',
-                                     'creationTime', 'language', 'sha','CID',
-                                     'baselineLinesOfCode', 'path', 'db_lang',
-                                     'db_lang_displayName', 'db_lang_file_count',
-                                     'db_lang_linesOfCode', 'ctime',
-                                     'primaryLanguage', 'finalised', 'left_index',
-                                     'size'])
-
-final_df.to_csv(sys.stdout, index=False)
+#
+#* Save results
+#
+full_df.to_csv(sys.stdout, index=False)
 
 # Local Variables:
 # python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
diff --git a/client/qldbtools/bin/mc-db-unique b/client/qldbtools/bin/mc-db-unique
index fd6fb91..7b8d811 100755
--- a/client/qldbtools/bin/mc-db-unique
+++ b/client/qldbtools/bin/mc-db-unique
@@ -1,8 +1,20 @@
 #!/usr/bin/env python
-""" Read a table of CodeQL DB information,
-    group entries by (owner,name,CID),
-    sort each group by creationTime,
-    and keep only the top (newest) element.
+""" Read a table of CodeQL DB information and produce a table with unique
+    entries, adding the Cumulative ID (CID) column.
+
+    To make this happen:
+    - Group entries by (owner,name,CID),
+      sort each group by creationTime,
+      and keep only the top (newest) element.
+
+    - Drop rows that don't have the
+      | cliVersion   |
+      | creationTime |
+      | language     |
+      | sha          |
+      columns.  There are very few (16 out of 6000 on recent tests) and their
+      DBs are questionable.
+
 """
 import argparse
 import logging
@@ -30,15 +42,71 @@ args = parser.parse_args()
 
 #
 import pandas as pd
 import sys
+import qldbtools.utils as utils
 
-df0 = pd.read_csv(sys.stdin)
+df2 = pd.read_csv(sys.stdin)
 
-df_sorted = df0.sort_values(by=['owner', 'name', 'CID', 'creationTime'])
+#
+#* Add single uniqueness field -- CID (Cumulative ID)
+#
+df2['CID'] = df2.apply(lambda row:
+                       utils.cid_hash((
+                           row['cliVersion'],
+                           row['creationTime'],
+                           row['language'],
+                           row['sha'],
+                       )), axis=1)
+
+#
+#* Re-order the dataframe columns by importance
+#  - Much of the data
+#    1. Is only conditionally present
+#    2. Is extra info, not for the DB proper
+#    3. May have various names
+#
+#  - The essential columns are
+#    | owner |
+#    | name |
+#    | language |
+#    | size |
+#    | cliVersion |
+#    | creationTime |
+#    | sha |
+#    | baselineLinesOfCode |
+#    | path |
+#
+#  - The rest are useful; put them last
+#    | db_lang |
+#    | db_lang_displayName |
+#    | db_lang_file_count |
+#    | db_lang_linesOfCode |
+#    | left_index |
+#    | ctime |
+#    | primaryLanguage |
+#    | finalised |
+
+df3 = df2.reindex( columns=['owner', 'name', 'cliVersion', 'creationTime',
+                            'language', 'sha','CID',
+                            'baselineLinesOfCode', 'path', 'db_lang',
+                            'db_lang_displayName', 'db_lang_file_count',
+                            'db_lang_linesOfCode', 'ctime',
+                            'primaryLanguage', 'finalised', 'left_index',
+                            'size'])
+
+# Identify rows missing specific entries
+rows = ( df3['cliVersion'].isna() |
+         df3['creationTime'].isna() |
+         df3['language'].isna() |
+         df3['sha'].isna() )
+df4 = df3[~rows]
+
+# Sort and group
+df_sorted = df4.sort_values(by=['owner', 'name', 'CID', 'creationTime'])
 df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index()
 
+# Write output
 df_unique.to_csv(sys.stdout, index=False)
-
 # Local Variables:
 # python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
 # End:
diff --git a/client/qldbtools/qldbtools/session-4-unique.py b/client/qldbtools/qldbtools/session-4-unique.py
index a95cd1b..ae35e73 100644
--- a/client/qldbtools/qldbtools/session-4-unique.py
+++ b/client/qldbtools/qldbtools/session-4-unique.py
@@ -1,13 +1,38 @@
-# Experimental work with utils.py, to be merged into it.
-from utils import *
+# Experimental work for ../bin/mc-db-unique, to be merged into it.
+import qldbtools.utils as utils
 from pprint import pprint
+import pandas as pd
 
+# cd ../
 #* Reload CSV file to continue work
-df2 = pd.read_csv('db-info-2.csv')
+df2 = df_refined = pd.read_csv('db-info-2.csv')
+# Identify rows missing specific entries
+rows = ( df2['cliVersion'].isna() |
+         df2['creationTime'].isna() |
+         df2['language'].isna() |
+         df2['sha'].isna() )
+df2[rows]
+df3 = df2[~rows]
+df3
+
+#* post-save work
+df4 = pd.read_csv('db-info-3.csv')
+
+# Sort and group
+df_sorted = df4.sort_values(by=['owner', 'name', 'CID', 'creationTime'])
+df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index()
+
+# Find duplicates
+df_dups = df_unique[df_unique['CID'].duplicated(keep=False)]
+len(df_dups)
+df_dups['CID']
+
+# Set display options
+pd.set_option('display.max_colwidth', None)
+pd.set_option('display.max_columns', None)
+pd.set_option('display.width', 140)
 
-df_sorted = df2.sort_values(by=['owner', 'name', 'creationTime'])
-df_unique = df_sorted.groupby(['owner', 'name']).first().reset_index()
 
 #
 # Local Variables:
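
Note on utils.cid_hash: the patch calls qldbtools.utils.cid_hash but its
implementation is not part of this diff.  A minimal sketch of the behavior
the patch relies on (a deterministic short hex digest over the four fields;
the 6-character CIDs `0e3379` and `3231f7` in the README suggest the digest
length, while the hash algorithm and field separator below are assumptions)
could look like this:

    import hashlib

    def cid_hash(fields, length=6):
        """Deterministic short hex digest over a tuple of field values.
        Sketch only; the real qldbtools.utils.cid_hash may differ."""
        joined = "\x1f".join("" if f is None else str(f) for f in fields)
        return hashlib.sha256(joined.encode("utf-8")).hexdigest()[:length]

    # Rows whose cliVersion/creationTime/language/sha are all missing would
    # collapse to a single CID, which is why mc-db-unique drops such rows
    # before grouping.
    print(cid_hash(("2.17.0",
                    "2024-05-08 12:13:10.038007+00:00",
                    "cpp",
                    "5ca7f4e59c2d7b0a93fb801a31138477f7b4a761")))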