diff --git a/client/qldbtools/qldbtools/session-generate-selection.py b/client/qldbtools/session/db-generate-selection.py similarity index 100% rename from client/qldbtools/qldbtools/session-generate-selection.py rename to client/qldbtools/session/db-generate-selection.py diff --git a/client/qldbtools/qldbtools/session1.py b/client/qldbtools/session/db-initial-info.py similarity index 100% rename from client/qldbtools/qldbtools/session1.py rename to client/qldbtools/session/db-initial-info.py diff --git a/client/qldbtools/qldbtools/session-populate-minio.py b/client/qldbtools/session/db-populate-minio.py similarity index 100% rename from client/qldbtools/qldbtools/session-populate-minio.py rename to client/qldbtools/session/db-populate-minio.py diff --git a/client/qldbtools/qldbtools/session-post-refine-info.py b/client/qldbtools/session/db-post-refine-info.py similarity index 97% rename from client/qldbtools/qldbtools/session-post-refine-info.py rename to client/qldbtools/session/db-post-refine-info.py index 4825678..e6e0728 100644 --- a/client/qldbtools/qldbtools/session-post-refine-info.py +++ b/client/qldbtools/session/db-post-refine-info.py @@ -1,3 +1,4 @@ +# Session around bin/mc-db-unique import qldbtools.utils as utils import pandas as pd diff --git a/client/qldbtools/qldbtools/session2.py b/client/qldbtools/session/db-refine-info.py similarity index 98% rename from client/qldbtools/qldbtools/session2.py rename to client/qldbtools/session/db-refine-info.py index 978da84..7e9e236 100644 --- a/client/qldbtools/qldbtools/session2.py +++ b/client/qldbtools/session/db-refine-info.py @@ -1,4 +1,4 @@ -# Experimental work with utils.py, to be merged into it. 
+# Experimental work to be merged with bin/mc-db-refine-info
 from utils import *
 from pprint import pprint
 
diff --git a/client/qldbtools/qldbtools/session-4-unique.py b/client/qldbtools/session/db-unique-1.py
similarity index 100%
rename from client/qldbtools/qldbtools/session-4-unique.py
rename to client/qldbtools/session/db-unique-1.py
diff --git a/client/qldbtools/session/db-unique.py b/client/qldbtools/session/db-unique.py
new file mode 100644
index 0000000..e6e0728
--- /dev/null
+++ b/client/qldbtools/session/db-unique.py
@@ -0,0 +1,46 @@
+# Session around bin/mc-db-unique
+import qldbtools.utils as utils
+import pandas as pd
+
+#
+#* Collect the information
+#
+df1 = pd.read_csv("scratch/db-info-2.csv")
+
+# Add single uniqueness field -- CID (Cumulative ID) -- using
+# - creationTime
+# - sha
+# - cliVersion
+# - language
+
+from hashlib import blake2b
+
+def cid_hash(row_tuple: tuple):
+    """
+    cid_hash(row_tuple)
+    Hash str(row_tuple) with blake2b and return the digest as a hex string
+    """
+    h = blake2b(digest_size = 3)  # 3-byte digest, i.e. 6 hex characters
+    h.update(str(row_tuple).encode())
+    # return int.from_bytes(h.digest(), byteorder='big')
+    return h.hexdigest()
+
+# Apply the cid_hash function to the specified columns and create the 'CID' column
+df1['CID'] = df1.apply(lambda row: cid_hash( (row['creationTime'],
+                                              row['sha'],
+                                              row['cliVersion'],
+                                              row['language'])
+                                            ), axis=1)
+
+df2 = df1.reindex(columns=['owner', 'name', 'cliVersion', 'creationTime',
+                           'language', 'sha','CID', 'baselineLinesOfCode', 'path',
+                           'db_lang', 'db_lang_displayName', 'db_lang_file_count',
+                           'db_lang_linesOfCode', 'ctime', 'primaryLanguage',
+                           'finalised', 'left_index', 'size'])
+
+df1['CID']  # inspect the new column; column labels are case-sensitive
+
+
+# Local Variables:
+# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
+# End: