#!/bin/bash
rootdir=/nird/datapeak/NS2345K ## datapeak of NS2345K
checkdirs="0FC4DD98-2706-4168-8381-5121DAFDFE11.tar.gz ARCHIVE_STAGEDIR AerChemMIP BCM CAM-Oslo CLM5_N1850_CTRL_f19_tn14_tag008 CLM5_N1850_OPT4_f19_tn14_tag008 CLM5_N1850_OPT5_f19_tn14_tag008 CLM5_N1850_OPT6_f19_tn14_tag008 CMIP6-emis CMIP6_PAMIP CTRL2_19 DO_NOT_TOUCH ERA_plot ESMVALTOOL ESMValCore ESMValTool_new F1850C5_f09_f09_LGM_test1 FC5_f19_f19_0302 FC60_f19_1811 FRAM-WWW I1850Clm50BgcCropCPLHIST_pADspinup_f19_tn14_160619 I1850Clm50BgcCropCPLHIST_spinup_N1850OCBDRDDMS_f19_tn14_250119 I1850Clm50BgcCropCPLHIST_spinup_f19_tn14_060619 I1850Clm50BgcCropCPLHIST_spinup_f19_tn14_finit2deg1240_070619 I1850Clm50BgcCropSpinup_f19_tn14_160619_CPLHIST MICOM_grid_files N1850AERCN_f09_g16_01 N1850AERCN_f09_g16_modice N1850AERCN_f09_g16_spinupre N1850AERCN_f09_g16_thincloud N1850_1312_f19 NORSTORE_OSL_DISK NORSTORE_OSL_TAPE NO_BACKUP NO_BACKUP.README.txt TMP_IHHK_UPLOAD_DATAARCHIVE aertests alok alok_esmvaltool altRHpiclimctrl amwg_diag_alok anu074 cases cgu025 cmip5 cmip6_omip crf dataporten-home diagnostics esg esmvaltool_Nird esp f19_tn11_CLIMVOTE_34MA_20190715_v2 findit gauravm home hra063 ice2ice ihkarset ipcc jerry johiak local logs martinls mer_tr michaelsch mifajd migrate_NS9998K milicak miniconda3 mpet noresm noresm_diagnostics oddho ovewh oyvinds peter_siew projects run_ada shared shuang temp test_swift tmp users workshop2021 www yfan zhang"
filelist="NS2345K_datapeak_filelist_$(date +%Y%m%dT%H%M).txt"  # could be very large, maybe over 1.5GB
errfile="filelist_classfy.err.txt"  # collects find errors, typically about unreadable directories


# get file list if not present
if [ ! -f "${filelist}" ]; then
    echo "Making file list: ${filelist}"
    rm -f "${errfile}"
    for dir in ${checkdirs} ; do
        echo "caching ${rootdir}/${dir}"
        find "${rootdir}/${dir}" -type f -printf "%u  %s hlink=%n %h/%f\n" >> "${filelist}" 2>> "${errfile}"
        ## columns: user  size  hlink=<count>  path/filename (an illustrative line follows the loop)
    done
fi
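
# For illustration only, a line in ${filelist} looks roughly like the following
# (hypothetical user name and path, not taken from real data):
#   someuser  104857600 hlink=1 /nird/datapeak/NS2345K/somedir/file.nc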

# count files with more than one hard link
# the trailing space in 'hlink=1 ' keeps hlink=10, hlink=12, ... from matching
echo "number of hard-linked files (potentially duplicated entries): $(grep -c -v 'hlink=1 ' "${filelist}")"
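
# A rough equivalent using awk on the hlink field (field 3), shown only as a
# sketch; it avoids substring matches entirely, but the grep above is what runs:
#   awk '$3 != "hlink=1" {n++} END {print n+0}' "${filelist}"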

# accumulate total usage (in bytes) per user
usageByUsers=$(awk '{user[$1]+=$2} END{for (i in user) print "u_"i, user[i]}' "${filelist}")  ## flat word list: u_<user> <size_in_bytes> ...
## rejected alternative: system("numfmt ...") inside awk prints the formatted size on its own and returns its exit status, so print appends a 0 instead of the size
##cat "${filelist}" | awk '{user[$1]+=$2}; END{ for( i in user) print i " " system("numfmt --to=iec " user[i] "| tr -d \"\n\"") }'
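
# A working sketch of the same idea, assuming GNU numfmt with --field support:
# print the raw byte totals and let numfmt convert the second column, instead of
# calling system() from inside awk. Not run here; the loop below is what runs.
#   awk '{user[$1]+=$2} END {for (i in user) print i, user[i]}' "${filelist}" \
#       | numfmt --field=2 --to=iec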

# walk the flat "u_<user> <bytes>" word list and print "<user>  <human-readable size>" pairs
for i in ${usageByUsers}; do
    if [ "${i#u_}" == "${i}" ]; then
        # no u_ prefix: the word is a byte count; convert it to a human-readable size
        printf "%5.5s\n" "$(echo "${i}" | numfmt --to=iec)"
    else
        # u_ prefix: the word is a user name
        printf "%10.10s  " "${i#u_}"
    fi
done
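
# Optional variant (sketch only, not run): the same per-user summary sorted by
# usage, largest first, assuming GNU sort and numfmt are available:
#   awk '{user[$1]+=$2} END {for (i in user) print user[i], i}' "${filelist}" \
#       | sort -rn | numfmt --to=iec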

# compress the file list
gzip "${filelist}"
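
# The compressed list can be inspected later without unpacking, e.g.:
#   zcat "${filelist}.gz" | less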
