# Experimental work, to be merged with bin/mc-db-refine-info.
#
# Interactive session script: reload the DB table saved in dbdf.csv.gz, pull
# metadata out of each DB zip file, join it onto the table and write the
# combined, column-ordered result to all-info-table.csv.gz.
#
# NOTE(review): extract_metadata, metadata_details, ExtractNotZipfile,
# ExtractNoCQLDB and DetailsMissing are presumably provided by the star
# import from utils -- confirm.
import os
from pprint import pprint

import pandas as pd

from utils import *

#* Reload gzipped CSV file to continue work
dbdf_1 = pd.read_csv('dbdf.csv.gz', compression='gzip')
#
# (old) Consistency check:
# dbdf_1.columns == dbdf.columns
# dbmask = (dbdf_1 != dbdf)
# dbdf_1[dbmask]
# dbdf_1[dbmask].dropna(how='all')
# ctime_raw is different in places, so don't use it.

#
#* Interact with/visualize the dataframe
# Using pandasgui -- qt
from pandasgui import show
os.environ['APPDATA'] = "needed-for-pandasgui"
show(dbdf_1)
# Using dtale -- web
import dtale
dtale.show(dbdf_1)
#

#
#* Collect metadata from DB zip files
#
#** A manual sample
#
# The bare expressions below only display values when evaluated
# interactively; they have no effect on later sections.
d = dbdf_1
left_index = 0
d.path[0]
cqlc, metac = extract_metadata(d.path[0])

cqlc['baselineLinesOfCode']
cqlc['primaryLanguage']
cqlc['creationMetadata']['sha']
cqlc['creationMetadata']['cliVersion']
cqlc['creationMetadata']['creationTime'].isoformat()
cqlc['finalised']

for lang, lang_cont in metac['languages'].items():
    print(lang)
    indent = " "
    for prop, val in lang_cont.items():
        if prop == 'files':
            print("%sfiles count %d" % (indent, len(val)))
        elif prop == 'linesOfCode':
            print("%slinesOfCode %d" % (indent, val))
        elif prop == 'displayName':
            print("%sdisplayName %s" % (indent, val))

#** Automated for all entries
# The rest of this interactive script is available as cli script in
# mc-db-refine-info
d = dbdf_1
joiners = []
# Series.items() yields (index label, path) for every row.  The previous
# range(0, len(d)-1) stopped one short, so the last DB never got metadata.
for left_index, db_path in d.path.items():
    try:
        cqlc, metac = extract_metadata(db_path)
    except (ExtractNotZipfile, ExtractNoCQLDB):
        # No metadata can be read from this path; leave the row without
        # details.
        continue

    try:
        detail_df = metadata_details(left_index, cqlc, metac)
    except DetailsMissing:
        continue
    joiners.append(detail_df)

# NOTE: pd.concat raises ValueError when joiners is empty, i.e. when no
# entry yielded metadata.
joiners_df = pd.concat(joiners, axis=0)
# Outer join keeps the rows of d for which no metadata was collected.
full_df = pd.merge(d, joiners_df, left_index=True, right_on='left_index',
                   how='outer')

#** View the full dataframe with metadata
# Import and environment setup are repeated so this section can be evaluated
# on its own.
from pandasgui import show
os.environ['APPDATA'] = "needed-for-pandasgui"
show(full_df)

#** Re-order the dataframe columns by importance
# - Much of the data
#   1. Is only conditionally present
#   2. Is extra info, not for the DB proper
#   3. May have various names
#
# - The essential columns are
#   | owner               |
#   | name                |
#   | language            |
#   | size                |
#   | cliVersion          |
#   | creationTime        |
#   | sha                 |
#   | baselineLinesOfCode |
#   | path                |
essential_columns = [
    'owner',
    'name',
    'language',
    'size',
    'cliVersion',
    'creationTime',
    'sha',
    'baselineLinesOfCode',
    'path',
]

# - The rest are useful; put them last
#   | db_lang             |
#   | db_lang_displayName |
#   | db_lang_file_count  |
#   | db_lang_linesOfCode |
#   | ctime               |
#   | primaryLanguage     |
#   | finalised           |
#   | left_index          |
extra_columns = [
    'db_lang',
    'db_lang_displayName',
    'db_lang_file_count',
    'db_lang_linesOfCode',
    'ctime',
    'primaryLanguage',
    'finalised',
    'left_index',
]

# reindex selects exactly these columns, in this order.
final_df = full_df.reindex(columns=essential_columns + extra_columns)

final_df.to_csv('all-info-table.csv.gz', compression='gzip', index=False)

#
# Local Variables:
# python-shell-virtualenv-root: "~/work-gh/mrva/mrvacommander/client/qldbtools/venv/"
# End:
#