Scripts

gremlins
gremfinderu
graphu
graphuNI
empties
fldnos
chkevoc
changes

gremlins

#!/bin/bash
#
# gremlins: count invisible "gremlin" characters in a UTF-8 text file.
#
# Usage: gremlins FILE
#
# First reports, for CR, NBSP, SHY and ZWSP, the number of occurrences and the
# number of records containing them (or "none"). Then lists every NUL, C0 or
# C1 control character that occurs in FILE, with the same two counts.
# Relies on grep -P and on an awk that understands \x escapes and multibyte
# characters (written for the GNU tools).

if [[ $# -ne 1 || ! -r $1 ]]; then
  printf 'Usage: %s FILE\n' "${0##*/}" >&2
  exit 2
fi

# Every scratch file lives in a private directory that is removed on any exit
# path. This avoids predictable names in /tmp and, more importantly, avoids
# overwriting and then deleting files called "base" or "other" in the
# directory the script is run from.
workdir=$(mktemp -d) || exit 1
trap 'rm -rf -- "$workdir"' EXIT

basegrem=$workdir/basegrem
othergrem=$workdir/othergrem

# Lookup tables, one character per line:
#   printf escape (without backslash)|description (abbreviation, code point, UTF-8 bytes)
cat > "$basegrem" <<'EOF'
u000d|carriage return (CR, u000d, 0d)
u00a0|non-breaking space (NBSP, u00a0, c2 a0)
u00ad|soft hyphen (SHY, u00ad, c2 ad)
u200b|zero-width space (ZWSP, u200b, e2 80 8b)
EOF

cat > "$othergrem" <<'EOF'
u0001|start of heading (SOH, u0001, 01)
u0002|start of text (STX, u0002, 02)
u0003|end of text (ETX, u0003, 03)
u0004|end of transmission (EOT, u0004, 04)
u0005|enquiry (ENQ, u0005, 05)
u0006|acknowledge (ACK, u0006, 06)
u0007|bell (BEL, u0007, 07)
u0008|backspace (BS, u0008, 08)
u000b|vertical tab (VT, u000b, 0b)
u000c|form feed (FF, u000c, 0c)
u000e|shift out (SO, u000e, 0e)
u000f|shift in (SI, u000f, 0f)
u0010|data link escape (DLE, u0010, 10)
u0011|device control 1 (DC1, u0011, 11)
u0012|device control 2 (DC2, u0012, 12)
u0013|device control 3 (DC3, u0013, 13)
u0014|device control 4 (DC4, u0014, 14)
u0015|negative acknowledge (NAK, u0015, 15)
u0016|synchronous idle (SYN, u0016, 16)
u0017|end of transmission block (ETB, u0017, 17)
u0018|cancel (CAN, u0018, 18)
u0019|end of medium (EM, u0019, 19)
u001a|substitute (SUB, u001a, 1a)
u001b|escape (ESC, u001b, 1b)
u001c|file separator (FS, u001c, 1c)
u001d|group separator (GS, u001d, 1d)
u001e|record separator (RS, u001e, 1e)
u001f|unit separator (US, u001f, 1f)
u007f|delete (DEL, u007f, 7f)
u0080|padding character (PAD, u0080, c2 80)
u0081|high octet preset (HOP, u0081, c2 81)
u0082|break permitted here (BPH, u0082, c2 82)
u0083|no break here (NBH, u0083, c2 83)
u0084|index (IND, u0084, c2 84)
u0085|next line (NEL, u0085, c2 85)
u0086|start of selected area (SSA, u0086, c2 86)
u0087|end of selected area (ESA, u0087, c2 87)
u0088|horizontal tabulation set (HTS, u0088, c2 88)
u0089|horizontal tab with justification (HTJ, u0089, c2 89)
u008a|line tabulation set (VTS, u008a, c2 8a)
u008b|partial line down (PLD, u008b, c2 8b)
u008c|partial line up (PLU, u008c, c2 8c)
u008d|reverse index (RI, u008d, c2 8d)
u008e|single shift two (SS2, u008e, c2 8e)
u008f|single shift three (SS3, u008f, c2 8f)
u0090|device control string (DCS, u0090, c2 90)
u0091|private use one (PU1, u0091, c2 91)
u0092|private use two (PU2, u0092, c2 92)
u0093|set transmit state (STS, u0093, c2 93)
u0094|cancel character (CCH, u0094, c2 94)
u0095|message waiting (MW, u0095, c2 95)
u0096|start of protected area (SPA, u0096, c2 96)
u0097|end of protected area (EPA, u0097, c2 97)
u0098|start of string (SOS, u0098, c2 98)
u0099|single graphic character introducer (SGCI, u0099, c2 99)
u009a|single character introducer (SCI, u009a, c2 9a)
u009b|control sequence introducer (CSI, u009b, c2 9b)
u009c|string terminator (ST, u009c, c2 9c)
u009d|operating system command (OSC, u009d, c2 9d)
u009e|privacy message (PM, u009e, c2 9e)
u009f|application program command (APC, u009f, c2 9f)
EOF

# tally TABLE HITS SHOW_ALL
#   For each code listed in TABLE, count that character in the file HITS and
#   print "description: N in M records" (counts in bold blue).
#   SHOW_ALL=1 reports absent characters as "none"; SHOW_ALL=0 skips them.
tally() {
  local table=$1 hits=$2 show_all=$3 code char
  while IFS='|' read -r code _; do
    # printf turns the escape (e.g. \u00a0) into the character itself. The
    # codes come only from the fixed tables above, so using one as a format
    # string is safe here.
    # shellcheck disable=SC2059
    char=$(printf "\\$code")
    awk -v STR="$code" -v CHAR="$char" -v ALL="$show_all" '
      FNR == NR { a[$1] = $2; next }              # first file: code -> description
      $0 ~ CHAR { cnt++; n += gsub(CHAR, "") }    # second file: records and occurrences
      END {
        if (n > 0)
          print a[STR] ": \x1b[1;34m" n " in " cnt " records\x1b[0m"
        else if (ALL)
          print a[STR] ": \x1b[1;34mnone\x1b[0m"
      }' FS="|" "$table" "$hits"
  done < "$table"
}

# Pre-filter FILE once per group so that each per-character awk pass only has
# to scan the records that contain at least one character of that group.
grep -P -e '\x{000d}|\x{00a0}|\x{00ad}|\x{200b}' -- "$1" > "$workdir/base"
grep -P -e '[\x01-\x08\x0b\x0c\x0e-\x19\x1a-\x1f\x7f\x80-\x9f]' -- "$1" > "$workdir/other"

tally "$basegrem" "$workdir/base" 1

printf "_ _ _ _ _ _ _ _ _ _ _ \n"
printf "\nChecking now for gremlin control characters, please wait..."
echo

# NUL cannot be carried in a shell variable, so it gets its own awk pass
# instead of an entry in the lookup table.
awk '/\x00/ {cnt++; n+=gsub(/\x00/,"")} END {if (n>0) print "null (NUL, u0000, x00):\x1b[1;34m "n" in "cnt" records\x1b[0m"}' "$1" > "$workdir/others"

tally "$othergrem" "$workdir/other" 0 >> "$workdir/others"

echo
if [[ -s $workdir/others ]]; then
  cat "$workdir/others"
else
  printf "No NULs or gremlin control characters found\n\n"
fi
echo
exit 0

gremfinderu

#!/bin/bash
#
# gremfinderu: locate one gremlin character in a tab-separated table.
#
# Usage: gremfinderu TABLE CODE
#   TABLE  tab-separated text file
#   CODE   the character written as a printf Unicode escape without its
#          backslash, e.g. u00a0 for a non-breaking space
#
# Writes the file "CODE-list-TABLE" (record number, field number and field
# content for every field that contains the character), prints a per-field
# summary, and optionally pages through the unique hits with the character
# replaced by a red {HERE} marker.

if [[ $# -ne 2 || ! -r $1 ]]; then
  printf 'Usage: %s TABLE CODE   (e.g. %s mytable u00a0)\n' "${0##*/}" "${0##*/}" >&2
  exit 2
fi

redden=$'\033[31;1m'
reset=$'\033[0m'
# Turn the escape code into the character itself (e.g. u00a0 -> NBSP).
char=$(printf "\\$2")
list="$2-list-$1"

# One output line per (record, field) hit, ordered by field then record.
awk -F"\t" -v grem="$char" '$0 ~ grem {for (i=1;i<=NF;i++) {if ($i ~ grem) {print NR FS i FS $i}}}' "$1" \
  | sort -t $'\t' -nk2 -nk1 > "$list"

echo
echo "Table \"$1\" has \"$2\"-containing words in the following field(s):"
# Summarise on the field-number column only. Including the field content here
# (cut -f2-) would make uniq count distinct values and print a separate
# "field N" line for each of them instead of one line per field.
cut -f2 "$list" | sort -n | uniq -c | awk '{print "\tfield "$2" in "$1" records"}'
echo

read -r -p "Show uniquified results with less? (y/n)" foo
echo
if [[ "$foo" == "n" ]]; then
  exit 0
fi

# Field number plus content, de-duplicated, with each gremlin made visible.
cut -f2- "$list" | sort -n | uniq | sed "s/$char/${redden}{HERE}${reset}/g" | less -RX
exit 0

graphu

To run the script without pv, delete "pv -w 50 -pbt "$1"" together with the pipe character (|) that follows it, and add "$1" (with quotes) immediately after the closing single quote of the AWK program.

#!/bin/bash
#
# graphu: tally every visible (graph-class) character in the file given as $1
# and print, in three columns, each count, the character and its code as
# uXXXX. pv shows a progress bar while the file is read.

pv -w 50 -pbt "$1" |
  awk '
    BEGIN { FS = "" }    # empty FS: every character becomes its own field
    {
      for (pos = 1; pos <= NF; pos++)
        if ($pos ~ /[[:graph:]]/)
          seen[$pos]++
    }
    END {
      for (ch in seen)
        printf("%s\t%s\n", seen[ch], ch)
    }' |
  sort -t $'\t' -k2 |
  while read -r entry; do
    # Re-encode the character as big-endian 16-bit units and take the first
    # hex group from the xxd dump as its code.
    code=$(cut -f2 <<<"$entry" | iconv -f utf-8 -t UNICODEBIG | xxd -g 2 | awk '{printf("u%s",$2)}')
    printf "%s\t%s\n" "$entry" "$code"
  done |
  pr -t -3

exit

graphuNI

This is graphu without ideographs. To run the script without pv, delete "pv -w 50 -pbt "$1" |" and add "$1" (with quotes) immediately after the AWK command.

empties

To run without pv, delete "pv -w 50 -pbt "$1" | " and add "$1" (with quotes) immediately after the closing single quote of the AWK program, before whatever follows it on that line.

#!/bin/bash
#
# empties: find fields that are empty in every data record of a
# tab-separated table whose first line is a header.
#
# Usage: empties TABLE
#
# Writes "TABLE_emptyfields" (one "number:name" line per empty field) and
# offers to build a copy of the table without those fields.

if [[ $# -ne 1 || ! -r $1 ]]; then
  printf 'Usage: %s TABLE\n' "${0##*/}" >&2
  exit 2
fi

# Private scratch directory, removed on every exit path (including Ctrl-C at
# one of the prompts), instead of fixed names in /tmp.
workdir=$(mktemp -d) || exit 1
trap 'rm -rf -- "$workdir"' EXIT
flds=$workdir/flds
allflds=$workdir/allflds

# Sum the content length of each field over all data records; a total of 0
# means the field is empty throughout. awk returns array keys in no
# particular order, so sort the field numbers before they are used.
pv -w 50 -pbt "$1" | awk -F"\t" 'NR>1 {for (i=1;i<=NF;i++) a[i]+=length($i)} END {for (j in a) {if (a[j]==0) print j}}' | sort -n > "$flds"

# Number the header fields. Done in awk rather than with nl because nl skips
# blank lines by default, which would shift every number after an empty
# header name.
awk -F"\t" 'NR==1 {for (i=1;i<=NF;i++) print i FS $i; exit}' "$1" > "$allflds"

if [[ ! -s $flds ]]; then
  echo
  echo "No empty fields in $1"
  exit 0
fi

# Attach the header name to each empty field number.
awk -F"\t" 'FNR==NR {b[$1]=$2; next} $1 in b {print $1":"b[$1]}' "$allflds" "$flds" > "$1"_emptyfields
echo
echo "$(wc -l < "$1"_emptyfields) empty field(s) in $1 table"
echo

read -r -p "Build table with non-empty fields only? (y/n) " build
if [[ "$build" == "y" ]]; then
  read -r -p "What name for the new table? " name
  # Drop the listed fields, keeping everything else in its original order.
  cut --complement -f"$(paste -d',' -s "$flds")" "$1" > "$name"
fi
exit 0

fldnos

#!/bin/bash
#
# fldnos: report how many tab-separated fields each record of a table has.
#
# Usage: fldnos TABLE
#
# If every record has the same number of fields, says so. Otherwise prints a
# tally of records per field count and offers to list the line numbers whose
# field count differs from the most common one.

if [[ $# -ne 1 || ! -r $1 ]]; then
  printf 'Usage: %s TABLE\n' "${0##*/}" >&2
  exit 2
fi

blue=$'\033[1;34m'
reset=$'\033[0m'

# Lines of "record-count<TAB>field-count", most frequent field count first.
tally=$(awk -F"\t" '{print NF}' "$1" | sort | uniq -c | sed 's/^[ ]*//;s/ /\t/' | sort -nr)

if [[ $(wc -l <<<"$tally") -eq 1 ]]; then
  IFS=$'\t' read -r nrecs nflds <<<"$tally"
  # The file name is passed as an argument, never as part of the format, so a
  # "%" or backslash in the name cannot corrupt the message.
  printf 'All %s%s%s records in %s%s%s have %s%s%s fields\n' \
    "$blue" "$nrecs" "$reset" "$blue" "$1" "$reset" "$blue" "$nflds" "$reset"
else
  printf '%s%s%s has:\n' "$blue" "$1" "$reset"
  awk -F"\t" -v BLUE="$blue" -v RESET="$reset" '{print BLUE $1 RESET" records with "BLUE $2 RESET" fields"}' <<<"$tally"
  echo
  read -r -p "Show line numbers with wrong field totals? (y/n)" var3
  case "$var3" in
    n)
      exit 0
      ;;
    y)
      # Every field count except the most common one (first tally line) is
      # treated as wrong. The list is fed to awk through process substitution,
      # so no temporary file is needed.
      awk -F"\t" -v BLUE="$blue" -v RESET="$reset" \
        'FNR==NR {arr[$0]; next} (NF in arr) {print "line "BLUE FNR RESET": "BLUE NF RESET" fields"}' \
        <(tail -n +2 <<<"$tally" | cut -f2) "$1"
      ;;
  esac
fi
exit

chkevoc

#!/bin/bash
#
# chkevoc: check the eventID / occurrenceID keys of the tab-separated files
# event.txt and occurrence.txt in the current directory.
#
# Usage: chkevoc   (no arguments)
#
# Reports, in order:
#   1. missing and duplicated eventID values in event.txt
#   2. missing occurrenceID, missing eventID and duplicated occurrenceID
#      values in occurrence.txt
#   3. eventID values in occurrence.txt that do not occur in event.txt
# The key columns are located by matching the header names in line 1.
# Requires gawk (arrays of arrays, PROCINFO["sorted_in"], ARGIND).

# Without this check a missing file could still reach the END blocks and
# print "No missing ..." / "No duplicate ..." for a file that was never read.
for f in event.txt occurrence.txt; do
  if [[ ! -r $f ]]; then
    printf '%s: cannot read %s in the current directory\n' "${0##*/}" "$f" >&2
    exit 1
  fi
done

echo
awk -F"\t" '
  # Header: remember which column holds eventID.
  NR == 1 {
    for (i = 1; i <= NF; i++) if ($i ~ /eventID/) pkev = i
  }
  # Data rows: flag blank keys; record every key with the lines it is on.
  NR > 1 {
    if ($pkev !~ /[[:alnum:]]/) {
      mee = 1
      print "eventID missing on line " NR " of event.txt"
    }
    pke[$pkev]++
    pkedupe[$pkev][NR]
  }
  END {
    PROCINFO["sorted_in"] = "@ind_str_asc"
    if (mee == 0) print "No missing eventID in event.txt"
    for (i in pke)
      for (j in pkedupe[i])
        if (pke[i] > 1) {
          dupev = 1
          print " duplicated eventID \"" i "\" on line " j " of event.txt"
        }
    if (dupev == 0) print "No duplicate eventID in event.txt"
  }' event.txt

echo
awk -F"\t" '
  # Header: locate the foreign key (eventID) and primary key (occurrenceID).
  NR == 1 {
    for (r = 1; r <= NF; r++) {
      if ($r ~ /eventID/) fkoc = r
      else if ($r ~ /occurrenceID/) pkoc = r
    }
  }
  # Data rows only. All three steps sit inside this one rule so that the
  # header line is never tested or counted as if it were a record.
  NR > 1 {
    if ($pkoc !~ /[[:alnum:]]/) {
      moo = 1
      print "occurrenceID missing on line " NR " of occurrence.txt"
    }
    if ($fkoc !~ /[[:alnum:]]/) {
      meo = 1
      print "eventID missing on line " NR " of occurrence.txt"
    }
    pko[$pkoc]++
    pkodupe[$pkoc][NR]
  }
  END {
    PROCINFO["sorted_in"] = "@ind_str_asc"
    if (moo == 0) print "No missing occurrenceID in occurrence.txt"
    if (meo == 0) print "No missing eventID in occurrence.txt"
    for (m in pko)
      for (n in pkodupe[m])
        if (pko[m] > 1) {
          dupoc = 1
          print " duplicated occurrenceID \"" m "\" on line " n " of occurrence.txt"
        }
    if (dupoc == 0) print "No duplicate occurrenceID in occurrence.txt"
  }' occurrence.txt

echo
awk -F"\t" '
  # First file (event.txt): collect every eventID.
  ARGIND == 1 && FNR == 1 {
    for (s = 1; s <= NF; s++) if ($s ~ /eventID/) pk = s
  }
  ARGIND == 1 && FNR > 1 {
    ev[$pk]
  }
  # Second file (occurrence.txt): report eventIDs with no matching event.
  ARGIND == 2 && FNR == 1 {
    for (t = 1; t <= NF; t++) if ($t ~ /eventID/) fk = t
  }
  ARGIND == 2 && FNR > 1 && !($fk in ev) {
    flag = 1
    print "eventID \"" $fk "\" on line " FNR " of occurrence.txt is not in event.txt"
  }
  END {
    if (flag != 1)
      print "All eventID in occurrence.txt are in event.txt"
  }' event.txt occurrence.txt
echo
exit

changes

#!/bin/bash
#
# changes: list field-by-field differences between two versions of a
# tab-separated table that have the same records in the same order.
#
# Usage: changes ORIGINAL EDITED IDFIELD
#   ORIGINAL  table before editing
#   EDITED    table after editing (same number of fields and records)
#   IDFIELD   number of the field that identifies a record
#
# Prints "id | field | original | edited" for every changed data item and
# saves the same report as "EDITED-edits-<timestamp>.txt".

if [[ $# -ne 3 || ! -r $1 || ! -r $2 || ! $3 =~ ^[1-9][0-9]*$ ]]; then
  printf 'Usage: %s ORIGINAL EDITED IDFIELD-NUMBER\n' "${0##*/}" >&2
  exit 2
fi

# paste puts each original record side by side with its edited version, so
# field j of the original lines up with field j+fn of the pasted record.
# The pasted stream is piped straight into awk: no intermediate file is
# created, so an existing file called "merged" is left alone.
paste "$1" "$2" \
  | awk -F"\t" -v idfld="$3" '
      NR == 1 {
        fn = NF / 2                              # fields per table
        for (i = 1; i <= fn; i++) a[i] = $i      # header names
        # The embedded newline yields a blank line that sed turns into a rule.
        print $idfld FS "field" FS "original" FS "edited\n"
        next
      }
      {
        for (j = 1; j <= fn; j++)
          # Appending "" forces a string comparison. Without it awk compares
          # numeric-looking fields as numbers and misses edits such as
          # 1.0 -> 1 or 007 -> 7.
          if (($j "") != ($(j + fn) ""))
            print $idfld FS a[j] FS $j FS $(j + fn)
      }' \
  | sed 's/^$/---------------/;s/\t/ | /g' \
  | tee "$2-edits-$(date +"%Y-%m-%dT%X").txt"

exit 0