Sort Keys for Master List

#!/usr/bin/gawk -f # # Use: dtsort [-v tail=filename1.html] # [-v keys=filename2.html] # [-v sort={protoname|topzone|name}] # sorted_file.html # # Program dtsort sorts

-style resource records into "JANET" or # "protocol" order. It also applies certain transformation rules, such # as changing all SGML/HTML codes to uppercase and suppressing # trailing blanks. In addition, it deletes exact duplicate resource # records, and concatenates similar records by using

.. to # indent. # # Optional file 'tail' is inserted into the 'sorted_file' just before # the final .

lines can be appended with this # technique. Optional file 'keys' will contain a listing of the sort # keys (the JANET-style values). # # D.Wells, NRAO-CV # 1994-02-11: start development # 1994-03-10: initial version # 1994-07-??: mods # 1997-12-04: /usr/ucb whoami, fix 'TOO MANY', changed date/time stamp # 1998-10-23: change for Linux # # Todo: # *change news:/janet logic so news will sort properly. # -------------------------------------------------------------------- BEGIN { tail_file = tail; if (sort == "") sort = "topzone"; # to_lower(""); # Copy input header lines up to first blank, then append comment: while (getline > 0) if (NF > 0) print; else break; command = "/bin/date -u +%Y-%m-%dT%k:%M:%SZ"; command | getline thisdate; close(command); command = "echo \"" sprintf("%12s","'"sort"'") " sort ("\ "`/usr/bin/whoami`@`hostname`.`domainname` " thisdate ")\""; command | getline string; close(command); printf("\n", string); # print comment to stdout print string | "cat 1>&2"; # also to stderr (shows on terminal) printf("\n

\n"); # if (kf = (keys != "")) { key_file = keys; print "" >key_file; print "Sort Keys for Master List" >>key_file; printf("\n", string) >>key_file; print "" >>key_file; printf("Sort-keys are intended to be in '%s' order:

\n", sort) >>key_file; print "

. # Therefore, we skip all input lines until we get a left-justified
, # then we concatenate the resource record lines, up to the next blank line: $0 ~ /^[<][Dd][Tt][>]/ { r = $0 "||"; while (getline) if (NF > 0) r = r $0 "||"; else break; # The lines of this resource are now concatenated into one string, # separated by the code '||': n++; resource[n] = r; } # -------------------------------------------------------------------- # When reach EOF, edit each resource into canonical form, use keys[] to # sort the resource records, then delete duplicates and merge similar # resources, and finally print the resources to stdout: END { printf ("%8d resource records read.\n", n) | "cat 1>&2"; # Edit resource records into canonical form: num_edit = 0; for (l = 1; l <= n; l++) { flag = 0; replace_pattern(" +\\|\\|", "||"); replace_pattern("\\|\\| +", "||"); # temporary rule? replace_pattern("\"file://","\"ftp://"); replace_pattern(" + -->", " -->"); replace_pattern("\\|\\|")) { # Test for special "???" (somebody) case: if ((substr(resource[a[l]], 1, RSTART-1) \ substr(resource[a[l]], RSTART+RLENGTH)) \ == resource[a[l+1]]) { resource[a[l+1]] = resource[a[l]]; num_merge++; num_somebody++; somebody_flag = 1; } } if (!somebody_flag) { # append resource[a[l+1]] to [l], makes indentation: if (match(resource[a[l]], " \\|\\|$")) { resource[a[l+1]] = substr(resource[a[l]], 1, RSTART-1) \ resource[a[l+1]] "

||"; num_multiple++; } else { resource[a[l+1]] = resource[a[l]] \ "

The following resources are similar "\
                        "(same sort-key, different text):||" \
                        resource[a[l+1]] "||";
                }
                num_merge++;
            }
        } else
            a[++i] = a[l];
    }
    # Keep the final index entry and shrink n to the compacted count.
    # NOTE(review): the start of this merge loop is not visible here, so the
    # initial value of i and the n == 0 case should be confirmed.
    a[++i] = a[n];
    n = i;
    if (num_merge > 0)
        printf ("%8d similar resource records merged. (n=%d) %d multiple, %d somebody\n", num_merge, n, num_multiple, num_somebody) | "cat 1>&2";
#
# following table gives comment strings for selected top-level domains:
# (the print loop looks entries up by the text before the first "." of
# each sort key and prints the comment next to it)
    tld["ar"] = "Argentina";
    tld["at"] = "Austria";
    tld["au"] = "Australia";
    tld["be"] = "Belgium";
    tld["bg"] = "Bulgaria";
    tld["br"] = "Brazil";
    tld["ca"] = "Canada";
    tld["ch"] = "Switzerland";
    tld["cl"] = "Chile";
    tld["co"] = "Colombia";
    tld["com"] = "US Commercial";
    tld["cz"] = "Czech Republic";
    tld["de"] = "Germany";
    tld["dk"] = "Denmark";
    tld["edu"] = "US Education & Research";
    tld["ee"] = "Estonia";
    tld["es"] = "Spain";
    tld["fi"] = "Finland";
    tld["fr"] = "France";
    tld["gov"] = "US Government Agencies";
    tld["gr"] = "Greece";
    tld["hu"] = "Hungary";
    tld["ie"] = "Ireland";
    tld["il"] = "Israel";
    tld["in"] = "India";
    tld["it"] = "Italy";
    tld["lv"] = "Latvia";
    tld["jp"] = "Japan";
    tld["mil"] = "US Military Organizations";
    # "ml" is the country code of Mali; Malta's code is "mt".
    tld["ml"] = "Mali";
    tld["mt"] = "Malta";
    tld["mx"] = "Mexico";
    tld["net"] = "Network Service Organizations";
    tld["news"] = "USEnet Newsgroups";
    tld["nl"] = "Netherlands";
    tld["no"] = "Norway";
    tld["nz"] = "New Zealand";
    tld["org"] = "Various Organizations";
    tld["pl"] = "Poland";
    tld["pt"] = "Portugal";
    tld["ro"] = "Romania";
    tld["ru"] = "Russia";
    tld["se"] = "Sweden";
    tld["sk"] = "Slovak Republic";
    tld["su"] = "Russia [Former Soviet Union]";
    tld["tw"] = "Taiwan";
    tld["uk"] = "United Kingdom [Great Britain]";
    tld["us"] = "United States";
    tld["uy"] = "Uruguay";
    tld["yu"] = "Yugoslavia";
    tld["za"] = "South Africa";
#
# Entries keyed by protocol name.
# NOTE(review): the numeric prefixes presumably fix the section order when
# sorting by protocol -- confirm against the key-building code.
    tld["1_http"] = "World Wide Web";
    tld["2_gopher"] = "Gopher";
    tld["3_wais"] = "Wide Area Information Services";
    tld["4_rlogin"] = \
    tld["4_telnet"] = "Remote Login";
    tld["5_file"] = "Anonymous-FTP [File Transfer Protocol]";
    tld["6_news"] = "USEnet_Newsgroups";
tld["7_mailto"] = "Email exploders"; # # print the sorted resource records: last_top = ""; for (l = 1; l <= n; l++) { this_key = keys[a[l]]; top = substr(this_key, 1, index(this_key, ".")-1); if (top != last_top) { topcom = (top in tld) ? topcom = tld[top] : ""; text = "2"; text = "\n

" top \ " (" topcom ")

\n\n%s\n

";
    # Copy the optional tail file to stdout, counting the lines copied.
    if (tail_file != "") {
        num_tail = 0;
        # Read from tail_file, not the main input. The parentheses make the
        # "> 0" test apply to getline's return value, so the loop ends on
        # end-of-file (0) and also on a read error (-1).
        while ((getline < tail_file) > 0) {
            print;
            num_tail++;
        }
        close(tail_file);
#
        printf ("%4d lines copied from %s.\n", num_tail, tail_file) | "cat 1>&2";
    }
    # NOTE(review): these two empty strings may have held closing markup
    # that was lost in extraction -- confirm against the original script.
    print "";
    print "";
    printf ("%8d resource records written.\n", n) | "cat 1>&2";
#
    # Finish and close the sort-key listing when one was requested.
    if (kf) {
        print "" >>key_file;
        close(key_file);
    }
}

# ------------------------------------------------------------------
# Replace 'pattern' in resource[l] with 'string' and set 'flag':
#
#   pattern - regular expression searched for in resource[l]
#   string  - literal replacement text
#
# Works on the global record index 'l', so the caller must set it first.
# Every occurrence is replaced, one match() at a time. Each replaced text
# is counted once in hit[], and the global 'flag' is set to 1 when at
# least one replacement was made. num_hits and matched are left global,
# as in the original.
function replace_pattern(pattern, string) {
    num_hits = 0;
    while (match(resource[l], pattern)) {
        matched = substr(resource[l], RSTART, RLENGTH);
        # Count each replacement exactly once.
        hit[matched]++;
        num_hits++;
        if (num_hits > 40) {
            # More than 40 replacements in one record is suspicious:
            # report it on stderr but keep going.
            printf("TOO MANY HITS? l=%3d,RS=%3d,RL=%2d,pat=|>%s<| matches |>%s<|\n",
                   l, RSTART, RLENGTH, pattern, matched) | "cat 1>&2";
            printf("resource[%d]=%s\n", l, substr(resource[l],1,60))|"cat 1>&2";
            # NOTE(review): the source shows "# exit(13);" here; it is kept
            # disabled -- confirm whether aborting was intended.
            # exit(13);
        }
        # Splice the replacement in place of the matched text.
        resource[l] = \
            substr(resource[l], 1, RSTART-1) \
            string \
            substr(resource[l], RSTART+RLENGTH);
        flag = 1;
    }
}

# ------------------------------------------------------------------
# Return 'string' with the letters A-Z converted to a-z; every other
# character is passed through unchanged.
#
# The lookup table lc[] is built on first use, so the function no longer
# depends on an earlier to_lower("") call. That call is still accepted
# and now returns "". The result is also left in the global lower_string,
# as before. lci and cc are function-local scratch variables.
function to_lower(string,    lci, cc) {
    if (!("A" in lc)) {
        lc["A"] = "a"; lc["B"] = "b"; lc["C"] = "c"; lc["D"] = "d";
        lc["E"] = "e"; lc["F"] = "f"; lc["G"] = "g"; lc["H"] = "h";
        lc["I"] = "i"; lc["J"] = "j"; lc["K"] = "k"; lc["L"] = "l";
        lc["M"] = "m"; lc["N"] = "n"; lc["O"] = "o"; lc["P"] = "p";
        lc["Q"] = "q"; lc["R"] = "r"; lc["S"] = "s"; lc["T"] = "t";
        lc["U"] = "u"; lc["V"] = "v"; lc["W"] = "w"; lc["X"] = "x";
        lc["Y"] = "y"; lc["Z"] = "z";
    }
    lower_string = "";
    for (lci = 1; lci <= length(string); lci++) {
        cc = substr(string, lci, 1);
        if (cc in lc)
            cc = lc[cc];
        lower_string = lower_string cc;
    }
    return(lower_string);
}