#!/usr/bin/gawk -f
#
# Use:  dtdiff new_file.html old_file.html > diff_file.html
#
# Program dtdiff compares two <DL>-style resource record files, and
# produces a 'diff' report in HTML format.  The comparison is done in
# memory, so it is not necessary that either input file be sorted.
#				D.Wells, NRAO-CV, 13Feb94.
# --------------------------------------------------------------------
#
# Global data built by the main rule:
#   titles[f], links[f]   header strings of file f (1 = new, 2 = old)
#   records[f,k]          text of record k of file f, lines joined by "||"
#   urls[f,k]             first HREF target found in records[f,k]
#   num_recs[f]           number of live records of file f
#
# The work is done in BEGIN (not END) so that awk does not first consume
# both files as main input -- or wait on stdin when no arguments are
# given -- before they are re-read through getline.
BEGIN {
    for (i = 1; i <= 2; i++) {		# read "new_file", then "old_file":
	if ((val = read_header(ARGV[i])) <= 0) {
	    printf("ABORT: read_header(%s) returned %d.\n", ARGV[i], val);
	    exit(13);
	}
	titles[i] = title;
	links[i] = link;

	j = 0;
	while ((val = read_record(ARGV[i])) > 0) {
	    j++;
	    records[i,j] = rr;
	    urls[i,j] = url;
	}
	if (val < 0) {			# 0 is a normal end of file
	    printf("ABORT: read_record(%s) returned %d for j=%d.\n",
		   ARGV[i], val, j);
	    exit(13);
	}
	num_recs[i] = j;
	close(ARGV[i]);
	# Disabled debugging dump:
	# print "file=" ARGV[i]; print num_recs[i] " records"; print "=" titles[i]; print "<LINK>=" links[i]; print "URL[3]=" urls[i,3]; print "records[3]=" records[i,3]; print " ";
    }
    #
    # At this point we have the two sets of records and URLs in memory.
    # First we will compare the lists and delete identical records.
    # Then we will produce several listings:
    #
    # Delete identical (same URL, same text) resources.  The outer index
    # is advanced only when nothing was deleted: after a deletion the
    # following record has moved into slot i and must be examined too.
    num_delete = 0;
    i = 1;
    while (i <= num_recs[1]) {
	twin = 0;
	for (j = 1; j <= num_recs[2]; j++) {
	    if ((urls[1,i] == urls[2,j]) && (records[1,i] == records[2,j])) {
		twin = j;
		break;
	    }
	}
	if (twin == 0) {
	    i++;
	    continue;
	}
	print "equal: " i, twin;	# development trace
	delete_record(1, i);
	delete_record(2, twin);
	num_delete++;
    }
    printf ("%4d identical resource records deleted.\n", num_delete);
    exit(1);

    # ----------------------------------------------------------------
    # Everything below is unreachable because of the exit above.  It
    # reads key[], resource[], n, num_edit and hit[], none of which is
    # filled in by the code above; it is kept unchanged as the outline
    # of the remaining report stages.
    # ----------------------------------------------------------------

    # Merge cases of identical URLs but different text:
    num_merge = 0;
    for (l = 1; l < n; l++) {
	if ((key[l] == key[l+1]) && (resource[l] != resource[l+1])) {
	    # append resource[l+1] to [l] with <DD> to make indentation:
	    resource[l] = resource[l] "<DD>||" resource[l+1] "</DD>||";
	    for (m = l+1; m < n; m++) {	# move rest of list up one place:
		key[m] = key[m+1];
		resource[m] = resource[m+1];
	    }
	    # resource[l+1] has been deleted:
	    num_merge++;
	    n--;			# decrements count of resource[];
	}
    }
    printf ("%4d similar resource records merged. (n=%d)\n",
	    num_merge, n) | "cat 1>&2";

    printf ("%4d resource records edited:\n", num_edit) | "cat 1>&2";
    for (i in hit)
	printf("\t%4d cases of |%s|\n", hit[i], i) | "cat 1>&2";

    print "</DL></BODY></HTML>";
    printf ("%4d resource records written.\n", n) | "cat 1>&2";
}

# ------------------------------------------------------------------
# delete_record(set, pos)
#   Removes record number pos of file `set` (1 or 2) from records[] and
#   urls[] by moving every later record down one slot, then drops the
#   vacated last slot and decrements num_recs[set].
function delete_record(set, pos,    k, last) {
    last = num_recs[set];
    for (k = pos; k < last; k++) {
	records[set,k] = records[set,k+1];
	urls[set,k] = urls[set,k+1];
    }
    delete records[set,last];
    delete urls[set,last];
    num_recs[set] = last - 1;
}

# ------------------------------------------------------------------
# read_header(filename)
#   Reads lines from filename up to the first blank line (NF == 0),
#   concatenates them, and extracts two strings into the GLOBAL
#   variables `title` (text between <TITLE> and </TITLE>) and `link`
#   (the HREF value of a <LINK ...> tag).  Either is set to "" when its
#   opening pattern is not found.
#   Returns 1 on success, or the getline result (0 at end of file,
#   negative on a read error) if input ends before a blank line.
#   Names after the gap in the parameter list are locals.
function read_header(filename,    head, status, start, len) {
    head = "";
    while (1) {
	if ((status = (getline < filename)) <= 0)
	    return (status);
	if (NF == 0)
	    break;
	head = head $0;
    }

    title = "";
    if (match(head, "<[Tt][Ii][Tt][Ll][Ee]>") > 0) {
	start = RSTART + RLENGTH;	# first character after the tag
	len = match(substr(head, start), "</[Tt][Ii][Tt][Ll][Ee]>") - 1;
	title = substr(head, start, len);
    }

    link = "";
    if (match(head, "<[Ll][Ii][Nn][Kk] .* [Hh][Rr][Ee][Ff]=\"") > 0) {
	start = RSTART + RLENGTH;	# first character of the HREF value
	len = match(substr(head, start), "\">") - 1;
	link = substr(head, start, len);
    }
    return(1);
}

# ------------------------------------------------------------------
# read_record(filename)
#   Reads the next resource record from filename and returns its text
#   and URL in the GLOBAL variables `rr` and `url`.
#   Canonical resource records begin with a blank line followed by <DT>.
#   Therefore, we skip all input lines until we get a left-justified
#   <DT>, then we concatenate the resource record lines, up to the next
#   blank line.  End of file also ends the record, so a final record
#   that is not followed by a blank line is still returned.
#   Returns 1 when a record was read, 0 when no further <DT> line
#   exists, and the negative getline result on a read error.
#   Names after the gap in the parameter list are locals.
function read_record(filename,    status, start, len) {
    while (1) {
	if ((status = (getline < filename)) <= 0)
	    return(status);
	if ($0 ~ /^[<][Dd][Tt][>]/)	# detect <DT> line
	    break;
    }

    rr = $0 "||";
    while (1) {
	status = (getline < filename);
	if (status < 0)
	    return(status);
	if (status == 0 || NF == 0)	# end of file or blank line
	    break;
	rr = rr $0 "||";
    }

    # The lines of this resource record are now concatenated into one
    # string, separated by the code '||'.  Now we will extract the URL:
    url = "";
    if (match(rr, "[Hh][Rr][Ee][Ff]=\"") > 0) {
	start = RSTART + RLENGTH;			# start of first URL
	len = index(substr(rr, start), "\"") - 1;	# its length
	url = substr(rr, start, len);
    }
    return(1);
}

# ------------------------------------------------------------------
# print_record(resource, filename)
#   Writes a record held in the '||'-joined form to filename, one
#   output line per '||'-terminated piece, followed by one blank line.
#   Text after the last '||' (if any) is written as a final line.
function print_record(resource, filename,    p) {
    while (length(resource) > 0) {
	p = index(resource, "||");		# find end of current line,
	if (p == 0) {				# no terminator left
	    printf("%s\n", resource) >filename;
	    break;
	}
	printf("%s\n", substr(resource, 1, p-1)) >filename;	# print it,
	resource = substr(resource, p+2);	# and delete it.
    }
    printf("\n") >filename;
}