diff --git a/client/qldbtools/README.md b/client/qldbtools/README.md index 6a7f70f..c923671 100644 --- a/client/qldbtools/README.md +++ b/client/qldbtools/README.md @@ -84,4 +84,10 @@ import qldbtools as ql ./bin/mc-db-generate-selection -n 23 vscode-selection.json gh-mrva-selection.json < db-info-3.csv - +## Notes + + The preview-data plugin for VS Code has a bug; it displays `0` instead of + `0e3379` for the following. There are other entries with similar malfunction. + + CleverRaven,Cataclysm-DDA,0e3379,2.17.0,2024-05-08 12:13:10.038007+00:00,cpp,5ca7f4e59c2d7b0a93fb801a31138477f7b4a761,578098.0,/Users/hohn/work-gh/mrva/mrva-open-source-download/repos-2024-04-29/CleverRaven/Cataclysm-DDA/code-scanning/codeql/databases/cpp/db.zip,cpp,C/C++,1228.0,578098.0,2024-05-13T12:14:54.650648,cpp,True,4245,563435469 + CleverRaven,Cataclysm-DDA,3231f7,2.18.0,2024-07-18 11:13:01.673231+00:00,cpp,db3435138781937e9e0e999abbaa53f1d3afb5b7,579532.0,/Users/hohn/work-gh/mrva/mrva-open-source-download/repos/CleverRaven/Cataclysm-DDA/code-scanning/codeql/databases/cpp/db.zip,cpp,C/C++,1239.0,579532.0,2024-07-24T02:33:23.900885,cpp,True,1245,573213726 diff --git a/client/qldbtools/bin/mc-db-refine-info b/client/qldbtools/bin/mc-db-refine-info index d5222fd..0202356 100755 --- a/client/qldbtools/bin/mc-db-refine-info +++ b/client/qldbtools/bin/mc-db-refine-info @@ -25,7 +25,7 @@ args = parser.parse_args() # #* Collect the information -# +# This step is time-intensive so we save the results right after. 
d = pd.read_csv(sys.stdin) joiners = [] for left_index in range(0, len(d)-1): @@ -43,51 +43,10 @@ for left_index in range(0, len(d)-1): joiners_df = pd.concat(joiners, axis=0) full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index', how='outer') -#** Add single uniqueness field -- CID (Cumulative ID) -full_df['CID'] = full_df.apply(lambda row: - utils.cid_hash(( - row['cliVersion'], - row['creationTime'], - row['language'], - row['sha'], - )), axis=1) - -#** Re-order the dataframe columns by importance -# - Much of the data -# 1. Is only conditionally present -# 2. Is extra info, not for the DB proper -# 3. May have various names -# -# - The essential columns are -# | owner | -# | name | -# | language | -# | size | -# | cliVersion | -# | creationTime | -# | sha | -# | baselineLinesOfCode | -# | path | -# -# - The rest are useful; put them last -# | db_lang | -# | db_lang_displayName | -# | db_lang_file_count | -# | db_lang_linesOfCode | -# | left_index | -# | ctime | -# | primaryLanguage | -# | finalised | - -final_df = full_df.reindex( columns=['owner', 'name', 'cliVersion', - 'creationTime', 'language', 'sha','CID', - 'baselineLinesOfCode', 'path', 'db_lang', - 'db_lang_displayName', 'db_lang_file_count', - 'db_lang_linesOfCode', 'ctime', - 'primaryLanguage', 'finalised', 'left_index', - 'size']) - -final_df.to_csv(sys.stdout, index=False) +# +#* Save results +# +full_df.to_csv(sys.stdout, index=False) # Local Variables: # python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/" diff --git a/client/qldbtools/bin/mc-db-unique b/client/qldbtools/bin/mc-db-unique index fd6fb91..7b8d811 100755 --- a/client/qldbtools/bin/mc-db-unique +++ b/client/qldbtools/bin/mc-db-unique @@ -1,8 +1,20 @@ #!/usr/bin/env python -""" Read a table of CodeQL DB information, - group entries by (owner,name,CID), - sort each group by creationTime, - and keep only the top (newest) element. 
+""" Read a table of CodeQL DB information and produce a table with unique entries + adding the Cumulative ID (CID) column. + + To make this happen: + - Group entries by (owner,name,CID), + sort each group by creationTime, + and keep only the top (newest) element. + + - Drop rows that don't have the + | cliVersion | + | creationTime | + | language | + | sha | + columns. There are very few (16 out of 6000 on recent tests) and their DBs + are questionable. + """ import argparse import logging @@ -30,15 +42,71 @@ args = parser.parse_args() # import pandas as pd import sys +import qldbtools.utils as utils -df0 = pd.read_csv(sys.stdin) +df2 = pd.read_csv(sys.stdin) -df_sorted = df0.sort_values(by=['owner', 'name', 'CID', 'creationTime']) +# +#* Add single uniqueness field -- CID (Cumulative ID) +# +df2['CID'] = df2.apply(lambda row: + utils.cid_hash(( + row['cliVersion'], + row['creationTime'], + row['language'], + row['sha'], + )), axis=1) + +# +#* Re-order the dataframe columns by importance +# - Much of the data +# 1. Is only conditionally present +# 2. Is extra info, not for the DB proper +# 3. 
May have various names +# +# - The essential columns are +# | owner | +# | name | +# | language | +# | size | +# | cliVersion | +# | creationTime | +# | sha | +# | baselineLinesOfCode | +# | path | +# +# - The rest are useful; put them last +# | db_lang | +# | db_lang_displayName | +# | db_lang_file_count | +# | db_lang_linesOfCode | +# | left_index | +# | ctime | +# | primaryLanguage | +# | finalised | + +df3 = df2.reindex( columns=['owner', 'name', 'cliVersion', 'creationTime', + 'language', 'sha','CID', + 'baselineLinesOfCode', 'path', 'db_lang', + 'db_lang_displayName', 'db_lang_file_count', + 'db_lang_linesOfCode', 'ctime', + 'primaryLanguage', 'finalised', 'left_index', + 'size']) + +# Identify rows missing specific entries +rows = ( df3['cliVersion'].isna() | + df3['creationTime'].isna() | + df3['language'].isna() | + df3['sha'].isna() ) +df4 = df3[~rows] + +# Sort and group +df_sorted = df4.sort_values(by=['owner', 'name', 'CID', 'creationTime']) df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index() +# Write output df_unique.to_csv(sys.stdout, index=False) - # Local Variables: # python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/" # End: diff --git a/client/qldbtools/qldbtools/session-4-unique.py b/client/qldbtools/qldbtools/session-4-unique.py index a95cd1b..ae35e73 100644 --- a/client/qldbtools/qldbtools/session-4-unique.py +++ b/client/qldbtools/qldbtools/session-4-unique.py @@ -1,13 +1,38 @@ -# Experimental work with utils.py, to be merged into it. -from utils import * +# Experimental work for ../bin/mc-db-unique, to be merged into it. 
+import qldbtools.utils as utils from pprint import pprint +import pandas as pd +# cd ../ #* Reload CSV file to continue work -df2 = pd.read_csv('db-info-2.csv') +df2 = df_refined = pd.read_csv('db-info-2.csv') +# Identify rows missing specific entries +rows = ( df2['cliVersion'].isna() | + df2['creationTime'].isna() | + df2['language'].isna() | + df2['sha'].isna() ) +df2[rows] +df3 = df2[~rows] +df3 + +#* post-save work +df4 = pd.read_csv('db-info-3.csv') + +# Sort and group +df_sorted = df4.sort_values(by=['owner', 'name', 'CID', 'creationTime']) +df_unique = df_sorted.groupby(['owner', 'name', 'CID']).first().reset_index() + +# Find duplicates +df_dups = df_unique[df_unique['CID'].duplicated(keep=False)] +len(df_dups) +df_dups['CID'] + +# Set display options +pd.set_option('display.max_colwidth', None) +pd.set_option('display.max_columns', None) +pd.set_option('display.width', 140) -df_sorted = df2.sort_values(by=['owner', 'name', 'creationTime']) -df_unique = df_sorted.groupby(['owner', 'name']).first().reset_index() # # Local Variables: