#!/usr/bin/perl
#
# Reads standard input, writes to standard output.
#
# Converts tagged text (tags look like <I>, <+>, <@Style name:>) to LaTeX in
# four passes that hand data to each other through temp001.tex, temp002.tex
# and temp003.tex in the current directory.
#
# The multi-pass structure is necessary for efficiency. If I do the
# whole file in one read, some of the operations in the second pass
# are pathetically slow.
#
# to do:
#   At the very end, after eating all tags, escape all < and >.
#   tabs => & ?
#   áè
#
# NOTE(review): the copy of this script that was available for review had
# lost its line structure, and text of the form <Word...> appears to have
# been removed from it (with <P> surviving only as a blank line).
#   * Statements marked RECONSTRUCTED put back the tag literals that the
#     surrounding code implies; confirm them against real input.
#   * Statements marked FIXME could not be recovered and are left disabled,
#     because in their surviving form they were either no-ops or destructive.
# NOTE(review): the non-ASCII literals below (¢ µ ° ¹ ³ ²) are matched as
# the bytes stored in this file, so they only work if the file is saved in
# the same encoding as the input -- confirm.

use strict;
use warnings;

my $debug = 1;    # not read anywhere in this script

# Latin letter => name of the LaTeX Greek-letter macro it stands for.
# Keys and values are quoted so that q, m and y cannot be taken for
# quote-like operators and so the table is valid under strict.
my %greek = (
    'a' => 'alpha',   'b' => 'beta',    'g' => 'gamma',   'G' => 'Gamma',
    'd' => 'delta',   'D' => 'Delta',   'e' => 'epsilon', 'h' => 'eta',
    'q' => 'theta',   'f' => 'phi',     'j' => 'phi',     'i' => 'iota',
    'k' => 'kappa',   'l' => 'lambda',  'm' => 'mu',      'n' => 'nu',
    'p' => 'pi',      'r' => 'rho',     't' => 'tau',     'y' => 'psi',
    'Y' => 'Psi',     'u' => 'upsilon', 'U' => 'Upsilon', 'z' => 'zeta',
    'x' => 'xi',      'c' => 'chi',     'w' => 'omega',   'W' => 'Omega',
    'S' => 'Sigma',
);

#==================================== first pass ============================================
# Whole-file pass: normalise the input, escape LaTeX specials, and turn the
# character-level tags (index entries, Greek, super/subscripts, italics)
# into LaTeX.  Reads <> (stdin or files named on the command line).
{
    local $/;    # slurp mode: each read returns a whole file

    open(my $out, '>', 'temp001.tex')
        or die("Cannot open temp001.tex for output");

    while (<>) {
        tr/\r/\n/;

        # ---------------- First, deal with stuff that would keep me from
        # recognizing patterns later:

        # Strip out all newlines that occur inside of brackets.
        # They only seem to occur at the deepest nesting level, so I only
        # code for that possibility.  Six rounds remove up to six newlines
        # per tag.  (A named loop variable keeps $_ bound to the text.)
        for my $round (1 .. 6) {
            s/<([^<>]*)\n([^<>]*)>/<$1$2>/g;
        }

        # Font size changes are likely to be wrong in LaTeX.
        # Sometimes I used them to adjust size of superscripts and subscripts
        # by hand.  They prevent me from recognizing certain patterns later
        # on, too.  They must die.
        # FIXME(review): both patterns lost their tag text.  What survives is
        #   s///g;        (an empty pattern re-uses the last successful one)
        #   s/]*>//g;     (as written this would delete every '>')
        # so they are disabled until the font-size tag syntax is restored.

        # Deal with a stray tag inside a run of characters.
        # FIXME(review): tag text lost.  What survives is a substitution of
        # ([\w \t\d]+) followed by a paragraph break with itself, i.e. a
        # no-op, so it is disabled.

        # Escape characters that have special significance in latex.  Do this
        # before I start generating my own latex.
        s/([\%\#\&\$\^_])/\\$1/g;
        # The replacement needs \\sim: a lone \s is not a string escape, so
        # the old form produced "$sim$" instead of "$\sim$".
        s/\~/\$\\sim\$/g;

        # ---------------- Recognize whole tags:

        # Equations rendered as pictures, e.g. <\& 32 22 9 "PICT14">:
        s/<[^<>]*PICT[^<>]*>//g;

        # Index entries, one level and two levels:
        s/<\*INDEX\s\d+\s\d+\s\d+\s\d+\s\d+\s\("([^"]+)"\s""\s""\)>/\\index{$1}/g;
        s/<\*INDEX\s\d+\s\d+\s\d+\s\d+\s\d+\s\("([^"]+)"\s""\s""\)\s\("([^"]+)"\s""\s""\)>/\\index{$1!$2}/g;

        # Greek
        # FIXME(review): the statement that wraps a Greek-font character in
        # $\greek{x}$ lost the tags that delimit it.  What survives is
        #   s/(<P>)?(.)\s*/\$\\greek{$2}\$/g;      (<P> shown as a blank line)
        # which would wrap every character of the file, so it is disabled.
        # Until it is restored, the \greek{...} expansions below match
        # nothing.

        # Expand \greek{x} through the %greek table; anything that is not in
        # the table is left untouched.
        s/\\greek\{([A-Za-z])\}/exists $greek{$1} ? "\\$greek{$1} " : "\\greek{$1}"/ge;

        # symbols
        s/\\greek\{¢\}/'/g;
        s/\\greek\{µ\}/\\propto/g;
        s/\$'\$/'/g;
        s/°/\\degunit/g;

        # ----------------- Itty bitties: ------------------------------
        # FIXME(review): two substitutions here lost their patterns; what
        # survives is  s// /g; s// /g;  (replace <something> with a space).
        # Disabled, because an empty pattern re-uses the last successful one.
        s/\xd5/'/g;
        s/\xd2/``/g;
        s/\xd3/''/g;
        s/\xa1/\\degc/g;
        s/¹/'/g;
        s/³/``/g;
        s/²/''/g;

        # Dashes
        s/\321/---/g;
        s/\320/-/g;

        # Scientific notation.  "powxxx" is a temporary stand-in for ^.
        # RECONSTRUCTED: the closing <P> showed up as a blank line.
        s/10<\+>\s*(\+?)(\d+)\s*<P>/\$10powxxx\{$1$2\}\$ /g;
        s/10<\+>\s*(\-+|<\\\-+>)(\d+)\s*<P>/\$10powxxx\{\-$2\}\$ /g;
        s/powxxx\{(\S)\}/powxxx$1/g;    # single-character exponent: drop the braces
        s/powxxx/^/g;

        # strange
        s/<\\\->/---/g;
        s/<\\\-+>/---/g;

        # superscripts, subscripts, variables in italics, ...

        # First, simplify math expressions to make them easier to recognize
        # and deal with later.  This happens /before/ we officially decide
        # whether they're math expressions.
        s/<\+>\s?(\w+)\s?<.>/\$^{$1}\$/g;
        s/<\->\s?(\w+)\s?<.>/\$_{$1}\$/g;
        # (disabled in the original; its tag literals are lost)
        #s/...[\(\)\[\]\/\+\-].../$1/g;

        # Leading or trailing whitespace: move it outside the italic run.
        # RECONSTRUCTED: <I> ... <P> assumed to open and close italics.
        s/<I>(\s*)([\w \t\+\-\/]+)<P>/$1<I>$2<P>/g;
        s/<I>([\w \t\+\-\/]+[\w\+\-\/])(\s*)<P>/<I>$1<P>$2/g;

        # Try to recognize cases where it's /not/ math:
        # multiple words, space between them, nothing but alpha and punctuation:
        # RECONSTRUCTED (leading <I>, trailing <P>).
        s/<I>([a-zA-Z\.\,\;\'\s]+)([a-zA-Z\.\,\;\']+)(\s*)<P>/\\emph{$1$2}$3/g;
        # ... doesn't work because of (())
        # at least three alphas in a row:
        # RECONSTRUCTED (leading <I>, trailing <P>).
        s/<I>([a-zA-Z\']{3,})<P>/\\emph{$1}/g;

        # Do it:
        s/\s?(\w+)\s?<(I|P)><\+>(\d)<.>/\$$1^$3\$/g;
        s/\s?(\w+)\s?<(I|P)><\+>(\w+)<.>/\$$1^{$3}\$/g;
        s/\s?(\w+)\s?<(I|P)><\->(\w+)<.>/\$${1}_{$3}\$/g;
        # RECONSTRUCTED (leading <I>, trailing <P>): any remaining italic run
        # becomes inline math.
        s/<I>\s?([^<>]+)\s?<P>/\$$1\$/g;

        # ----------------- paragraph breaks ------------------------------
        # NOTE(review): as written this also consumes the "<\<>Chapter title>"
        # style markers that the third pass matches on -- confirm.
        s/<\\<>[^<>]*>/\n\n/g;

        # ----------------- detect displayed equations ------------------------------
        # aligned equations: a tab-indented line containing exactly one "="
        s/\t([^=\n]*)=([^=\n]+)\n+/\t$1 = $2 \\\\\n/g;

        # --------------- math stuff
        # minus sign, not m-dash:
        s/\-\-\-(\d)/-$1/g;
        s/\-\-\-\$/-\$/g;
        s/([^\n]*)\-\-\-([^\n]*)\\\\\n/$1-$2\\\\\n/g;

        # My known habits:
        s/ ,/ \\qquad ,/g;
        s/ \./ \\qquad ./g;

        print {$out} $_;
    }

    close($out) or die("Cannot close temp001.tex");
}

#==================================== second pass ============================================
# Line-by-line pass over temp001.tex.
# RECONSTRUCTED: the input open and the <...> read operator were missing;
# the file names follow from what the neighbouring passes write and read.
{
    open(my $in, '<', 'temp001.tex')
        or die("Cannot open temp001.tex for input");
    open(my $out, '>', 'temp002.tex')
        or die("Cannot open temp002.tex for output");

    while (<$in>) {
        #-------------- work more on displayed equations
        # Eliminate dollar signs in displayed equations (lines ending in \\).
        # Each round removes one $...$ pair, so up to four pairs per line.
        for my $round (1 .. 4) {
            s/(.*)\$(.*)\$(.*)\\\\\n/$1$2$3\\\\\n/g;
        }
        print {$out} $_;
    }

    close($in);
    close($out) or die("Cannot close temp002.tex");
}

#==================================== third pass ============================================
# Whole-file pass over temp002.tex: sectioning, tag removal, equation
# environments and math clean-up.
# RECONSTRUCTED: input open and read operator, as in the second pass.
{
    local $/;    # slurp mode

    open(my $in, '<', 'temp002.tex')
        or die("Cannot open temp002.tex for input");
    open(my $out, '>', 'temp003.tex')
        or die("Cannot open temp003.tex for output");

    while (<$in>) {
        #------------------- sectioning
        #<@Chapter title:><\<>Chapter title>0 Introduction and Review
        s/<[^<>]+><\\<>Chapter title>(\d+)?(\s+)?([^<>]+)\n/\\chapter\{$3\}\n/g;
        s/<\@Chapter title:>//g;
        s/<[^<>]+><\\<>Section title>(\d+)?(\s+)?([^<>]+)\n/\\section\{$3\}\n/g;
        s/<\@Section title:>//g;
        s/<[^<>]+><\\<>Subsection title>(\d+)?(\s+)?([^<>]+)\n/\\subsection\{$3\}\n/g;
        s/<\@Subsection title:>//g;

        # --------------------------Eliminate nearly all tags not dealt with so far:
        s/<[^@\\<>][^<>]*>//g;
        s/<\@:>//g;                             # anonymous paragraph styles
        s/<\@[^<>=]+=(<\@[^<>]+>)*>\n+//g;      # style definitions

        # --------------------------displayed equations
        # Wrap every line ending in \\ in its own equation*, then merge
        # neighbouring environments and turn multi-line ones into align*.
        s/(.*)\\\\\n/\\begin\{equation\*\}\n\t$1 \\\\\n\\end\{equation\*\}\n/g;
        s/\\end\{equation\*\}\s*\\begin\{equation\*\}//g;
        s/\\\\\n\n/\\\\\n/g;
        s/\n{2,}\\begin\{equation\*\}/\n\\begin\{equation\*\}/g;
        s/\\begin\{equation\*\}\n((.*\\\\\n){2,})\\end\{equation\*\}/\\begin\{align\*\}\n$1\\end\{align\*\}/g;
        # A bracketed remark at the end of an equation line becomes \text:
        s/\[([^\[\]]+)\]\s*\\\\/ \\qquad \\text\{\[$1\]\} \\\\/g;
        # No \\ on the last line of an environment:
        s/\\\\\n\\end\{/\n\\end\{/g;

        # ----------------- Clean up math ------------------------------
        # Beautify scientific notation, e.g. 0.5x$10^{-3}$
        s/(\-?)([\d\.]+)x\$10\^/\$$1$2\\times10^/g;
        s/_o/_\\zu\{o\}/g;
        s/\$[ \t\d\.]*\$//g;
        s/\$[ \t\d\.]*=[ \t\d\.]*\$/=/g;
        # dollar signs with nothing between them but a math symbol:
        s/\$\s?([\+\-\/\=\(\)\,\|])\s?\$/$1/g;
        # Simplify one-character subscripts and superscripts:
        s/\^\{(.)\}/^$1/g;
        s/_\{(.)\}/_$1/g;
        # operators:
        s/(cos|sin|ln|exp|tan)([^a-z])/\\$1 $2/g;

        # Inch-marks to quotes.
        # Do this after all tags are dealt with, because tags have inch-mark
        # quotes sometimes.
        s/"([^"]*)"/``$1''/g;

        # ------------------Eliminate all remaining tags
        #<@Body text, no indent:>
        s/<[^<>]*>//g;

        s/\n\n\n/\n\n/g;

        print {$out} $_;
    }

    close($in);
    close($out) or die("Cannot close temp003.tex");
}

#==================================== fourth pass ============================================
# Line-by-line pass over temp003.tex to standard output: alignment marks,
# inline-math dollar signs, and wrapping of long lines.
# RECONSTRUCTED: input open and read operator, as in the second pass.
{
    open(my $in, '<', 'temp003.tex')
        or die("Cannot open temp003.tex for input");
    # Duplicate STDOUT rather than opening a file literally named "-".
    open(my $out, '>&', \*STDOUT)
        or die("Cannot open stdout for output");

    # Any single-letter word except a, A and I is taken to be a variable.
    my $variable = qr/[b-zB-HJ-Z]/;

    my $in_align = 0;    # inside align*
    my $in_eqn   = 0;    # inside align* or equation*

    while (<$in>) {
        # Track the environment.  A \begin line already counts as inside,
        # an \end line already counts as outside.
        if (m/\\begin\{align\*\}/)    { $in_align = 1; $in_eqn = 1; }
        if (m/\\end\{align\*\}/)      { $in_align = 0; $in_eqn = 0; }
        if (m/\\begin\{equation\*\}/) { $in_align = 0; $in_eqn = 1; }
        if (m/\\end\{equation\*\}/)   { $in_align = 0; $in_eqn = 0; }

        if ($in_align) {
            s/=/&=/g;    # & is literal in a Perl replacement: alignment mark
        }

        s/ _/_/g;

        if (!$in_eqn) {
            # catch variables that weren't in italics
            s/([\s\d])($variable)([\s\,])/$1\$$2\$$3/g;
            # Similar, but avoid misinterpreting abbreviations like "i.e.":
            s/([\s\d])($variable)(\.\s)/$1\$$2\$$3/g;

            # shuffle dollar signs around
            s/\s([\da-zA-Z\+\-\_\^\/]+)\$([^\$]+)\$/ \$$1$2\$/g;
            s/\$([^\$]+)\$([\da-zA-Z\+\-\_\^\/]+)\s/\$$1$2\$ /g;
            s/_(.)_(.)/_\{$1$2\}/g;
            s/([a-zA-Z])\$_/\$${1}_/g;
            #s/\$([^\$]*)\$([^ \$]*)\$([^\$]*)\$/\$$1$2$3\$/g;# eat 4 dollar signs -> 2
            # ... wrong, because allows blanks, could combine unrelated expressions
            s/\$([^ .,\$]*)\$([^ .,\$]*)/\$$1$2\$/g;
            s/([^ .,\$]*)\$([^ .,\$]*)\$/\$$1$2\$/g;
            s/\$([^ \$]*)\$([^ \$]*)\$([^ \$]*)\$/\$$1$2$3\$/g;    # eat 4 dollar signs -> 2
            s/ ([^ \$]*)=([^ \$]*) / \$$1=$2\$ /g;
            s/,\$/\$,/g;
            s/\.\$/\$\./g;
            s/\$\s*\$//g;
            s/\$=\$//g;
        }

        if ($in_eqn) {
            s/\$//g;
        }

        # Wrap long lines: while some stretch of 70+ characters remains,
        # break at a space after 50-60 characters.  Capped at 50 rounds so a
        # line with no usable space cannot loop forever.
        my $count = 0;
        while (m/.{70,}/ && $count < 50) {
            s/(.{50,60}) (.{20,})/$1\n$2/;
            ++$count;
        }

        print {$out} $_;
    }

    close($in);
    close($out) or die("Cannot close stdout");
}