#! /bin/perl
#
#   De-moron-ise Text from Microsoft Applications
#
#               by John Walker -- January 1998
#                  http://www.fourmilab.ch/
#
#               This program is in the public domain.
#
#   Version History
#
#   09/05/2001  FatPhil (Phil Carmody)
#       Modified to cope with MS's symbol fonts, and character entities
#       7-bit clean output in skeleton form, but not yet done
#       New options: -4 -> use HTML 4; -7 -> make 7-bit clean
#
#   Usage: demoroniser [ options ] [ infile [ outfile ] ]
#
#   Reads infile (default standard input) one line at a time, passes
#   each line through demoronise() and writes the result, wrapped by
#   printWrap(), to outfile (default standard output).
#
#   NOTE(review): several patterns in this file contain HTML markup or
#   entity names.  Each such place is marked below; confirm them
#   against a known-good copy of the script.

use strict;
use warnings;

our $lineWrap   = 72;           # Wrap lines at this column
our $lineBreak1 = '[<]';        # Line break first pass candidates
our $lineBreak2 = '[>]';        # Line break second pass candidates

#   How advanced are we?  (3 = HTML 3 fallbacks, 4 = HTML 4 entities)
our $dtd = 3;

#   What do we process?
our $numerics   = 1;    # numeric codes become meaningful entities
our $symbolfont = 1;    # anything in symbol font needs to change!
our $reserved   = 1;    # reserved 8-bit control characters become entities
our $eightbit   = 0;    # other 8-bit characters become entities

#   Symbol font nonsense: replacement text indexed by character code.
#   Filled in by SymbolAny() plus Symbol3() or Symbol4().
our @Symbol = ();

#   I/O state shared by the main loop, demoronise() (for warnings)
#   and printWrap() (for output and line counting).
our $if     = \*STDIN;
our $of     = \*STDOUT;
our $ifname = "(stdin)";
our $iline  = 0;                # input lines read so far
our $oline  = 0;                # output lines written so far

#   Characters demoronise() treats as "reserved": C0 controls other
#   than 0x09-0x0F, and the 0x80-0x9F block.
my $RESERVED_RE = qr/[\x00-\x08\x10-\x1F\x80-\x9F]/;

#   Substitutes for bytes in the range 0x80 -- 0x9F.  Each entry is
#   [ HTML 3 replacement, HTML 4 replacement ]; undef means "no
#   substitute at this level", in which case the byte is left alone
#   and reported as untranslated.
#   NOTE(review): the rows for 0x8B (HTML 4), 0x8C and 0x91-0x97 follow
#   the Windows-1252 code page and the pattern of the neighbouring
#   rows; the <em>/<sup> wrappers on 0x83, 0x98 and 0x99 are likewise
#   assumed.  Confirm against a reference copy.
my %RESERVED_MAP = (
    "\x80" => [ undef,             '&euro;'     ],
    # unknown 0x81
    "\x82" => [ ',',               ','          ],
    "\x83" => [ '<em>f</em>',      '<em>f</em>' ],  # not in Latin-2
    "\x84" => [ ',,',              '&bdquo;'    ],
    "\x85" => [ '...',             '&hellip;'   ],
    "\x86" => [ undef,             '&dagger;'   ],
    "\x87" => [ undef,             '&Dagger;'   ],
    "\x88" => [ '^',               '^'          ],  # not in Latin-2
    "\x89" => [ " \xB0/\xB0\xB0",  '&permil;'   ],
    "\x8A" => [ undef,             '&Scaron;'   ],
    "\x8B" => [ '&lt;',            '&lsaquo;'   ],
    "\x8C" => [ 'Oe',              '&OElig;'    ],
    "\x91" => [ '`',               '&lsquo;'    ],
    "\x92" => [ "'",               '&rsquo;'    ],
    "\x93" => [ '"',               '&ldquo;'    ],
    "\x94" => [ '"',               '&rdquo;'    ],
    "\x95" => [ '*',               '&bull;'     ],
    "\x96" => [ '-',               '&ndash;'    ],
    "\x97" => [ '--',              '&mdash;'    ],
    "\x98" => [ '<sup>~</sup>',    '&tilde;'    ],
    "\x99" => [ '<sup>TM</sup>',   '&trade;'    ],
    "\x9A" => [ undef,             '&scaron;'   ],
    "\x9B" => [ '&gt;',            '&rsaquo;'   ],
    "\x9C" => [ 'oe',              '&oelig;'    ],
);

#   main -- Process command line options, open the input and output
#           files and run every input line through demoronise() and
#           printWrap().

sub main {

    #   Process command line options.  Arguments starting with '-'
    #   are removed from @ARGV; a lone '-' ends option processing.

    my $i = 0;
    while ($i <= $#ARGV) {
        if ($ARGV[$i] !~ m/^-/) {
            $i++;
            next;
        }
        my $o = splice(@ARGV, $i, 1);
        last if length($o) == 1;
        my $opt = substr($o, 1, 1);
        my $arg = substr($o, 2);

        #   -u  --  Print how-to-call information

        if ($opt eq 'u' || $opt eq '?') {
            print("Usage: demoroniser [ options ] infile outfile\n");
            print(" Options:\n");
            print(" -u Print this message.\n");
            print(" -wcols Wrap lines at cols columns, 0 = no wrap.\n");
            print(" -4 Use HTML 4 entities.\n");
            print(" -7 Make output 7-bit clean.\n");
            exit(0);

        #   -wcols  --  Wrap lines at cols columns, 0 = no wrap

        } elsif ($opt eq 'w') {
            if ($arg =~ m/^\d+$/) {
                $lineWrap = $arg;
                if ($lineWrap == 0) {
                    $lineWrap = 1 << 31;
                }
            } else {
                die("Invalid wrap length '$arg' in -w option.\n");
            }

        } elsif ($opt eq '4') {
            $dtd = 4;

        } elsif ($opt eq '7') {
            $eightbit = 1;

        } else {
            warn("Ignoring unknown option '$o'.\n");
        }
    }

    if ($symbolfont) {
        SymbolAny();
        if ($dtd > 3) {
            Symbol4();
        } else {
            Symbol3();
        }
    }

    #   Open input and output files

    my ($opened_in, $opened_out);
    $if = \*STDIN;
    $of = \*STDOUT;
    $ifname = "(stdin)";
    if (@ARGV >= 1) {
        open($opened_in, '<', $ARGV[0])
            or die("Cannot open input file $ARGV[0]: $!\n");
        $if = $opened_in;
        $ifname = $ARGV[0];
    }
    if (@ARGV >= 2) {
        open($opened_out, '>', $ARGV[1])
            or die("Cannot open output file $ARGV[1]: $!\n");
        $of = $opened_out;
    }

    $iline = 0;
    $oline = 0;
    while (defined(my $line = <$if>)) {
        $iline++;
        printWrap(demoronise($line));
    }

    #   Only close what we opened; buffered write errors on a file
    #   surface at close time, so check it.
    if ($opened_in) {
        close($opened_in);
    }
    if ($opened_out) {
        close($opened_out)
            or die("Error closing output file $ARGV[1]: $!\n");
    }
    return;
}

#   _map_reserved  --  Return the substitute for one byte in the
#                      range 0x80 -- 0x9F.  $col selects the HTML 3
#                      (0) or HTML 4 (1) column of %RESERVED_MAP.
#                      Bytes with no substitute are returned as is.

sub _map_reserved {
    my ($byte, $col) = @_;

    my $entry = $RESERVED_MAP{$byte};
    return $byte unless $entry && defined $entry->[$col];
    return $entry->[$col];
}

#   _translate_symbol_run  --  Translate the text content of one
#                              Symbol-font element using @Symbol.
#   NOTE(review): the handling of characters with no @Symbol entry is
#   an assumption: whitespace is passed through, anything else is kept
#   inside its own Symbol-font element so that no information is lost.

sub _translate_symbol_run {
    my ($run) = @_;

    my $out = '';
    for my $ch (split(//, $run)) {
        my $mapped = $Symbol[ord($ch)];
        if (defined $mapped) {
            $out .= $mapped;
        } elsif ($ch =~ m/\s/) {
            $out .= $ch;
        } else {
            $out .= qq{<FONT FACE="Symbol">$ch</FONT>};
        }
    }
    return $out;
}

#   demoronise  --  Translate moronic Microsoft bit-drool into
#                   vaguely readable and compatible HTML.
#
#   Takes one input line and returns the translated line, always
#   terminated by a single "\n".  Reads the configuration globals
#   ($dtd, $numerics, $symbolfont, $reserved, $eightbit, @Symbol) and,
#   for warnings written to STDERR, $ifname, $iline and $oline.

sub demoronise {
    my ($s) = @_;

    #   Eliminate idiot MS-DOS carriage returns from line terminator

    $s =~ s/\s+$//;
    $s .= "\n";

    #   Turn MS symbol font into real characters.  Only the matched
    #   Symbol-font elements are rewritten; every other tag, including
    #   the closing tags of other FONT elements, is left untouched.
    #   NOTE(review): the exact spelling of the opening tag is assumed.

    if ($symbolfont) {
        $s =~ s{<FONT\s+FACE="?Symbol"?[^>]*>\s*([^<]*)</FONT>}
               {_translate_symbol_run($1)}gie;
    }

    if ($reserved && $s =~ $RESERVED_RE) {

        #   Map strategically incompatible non-ISO characters in the
        #   range 0x80 -- 0x9F into plausible substitutes where
        #   possible.

        my $col = ($dtd >= 4) ? 1 : 0;
        $s =~ s/([\x80-\x9F])/_map_reserved($1, $col)/ge;

        #   Now check for any remaining untranslated characters.

        while ($s =~ m/($RESERVED_RE)/g) {
            printf(STDERR
                "%s: warning--untranslated character 0x%02X in input line %d, output line(s) %d(...).\n",
                $ifname, ord($1), $iline, $oline + 1);
        }
    }

    #   Skeleton of the 7-bit clean mode: only two characters so far.
    #   NOTE(review): the enclosing $eightbit test is assumed.

    if ($eightbit) {
        if ($dtd >= 3) {
            $s =~ s/\xB1/&plusmn;/g;
            $s =~ s/\xF7/&divide;/g;
        }
    }

    #   Supply missing semicolon at end of numeric entity if
    #   Billy's bozos left it out.

    $s =~ s/(&\#[0-2]\d\d)\s/$1; /g;

    #   Fix dimbulb obscure numeric rendering of &lt; &gt; &amp;

    if ($numerics) {
        $s =~ s/&#038;/&amp;/g;
        $s =~ s/&#060;/&lt;/g;
        $s =~ s/&#062;/&gt;/g;
    }

    #   Fix unquoted non-alphanumeric characters in table tags
    #   NOTE(review): these three patterns are assumed; confirm.

    $s =~ s{(<TABLE\s.*)(WIDTH=)(\d+%)(\D)}{$1$2"$3"$4}gi;
    $s =~ s{(<TD\s.*)(WIDTH=)(\d+%)(\D)}{$1$2"$3"$4}gi;
    $s =~ s{(<TH\s.*)(WIDTH=)(\d+%)(\D)}{$1$2"$3"$4}gi;

    #   Correct PowerPoint mis-nesting of tags
    #   NOTE(review): tag names assumed; confirm.

    $s =~ s{(<FONT .*>\s*<STRONG>.*)(</FONT>\s*</STRONG>)}{$1</STRONG></FONT>}gi;

    #   Translate bonehead PowerPoint misuse of <UL> to achieve
    #   paragraph breaks.

    $s =~ s{<P>\s*<UL>}{<p>}gi;
    $s =~ s{</UL><UL>}{<p>}gi;
    $s =~ s{</UL>\s*</P>}{}gi;

    #   Repair PowerPoint depredations in "text-only slides"
    #   NOTE(review): the last two patterns are assumed; confirm.

    $s =~ s{<P></P>}{}gi;
    $s =~ s{ <TD HEIGHT=100}{ <tr><TD HEIGHT=100}gi;
    $s =~ s{<LI><H2>}{<H2>}gi;

    return $s;
}

#   printWrap  --  Print one or more lines with wrap at
#                  the specified column.
#
#   $s must consist of complete, newline-terminated lines; anything
#   else is a fatal error.  Output goes to $of and $oline is advanced
#   once per line written.  Continuation lines are indented like the
#   start of $s, plus one space when $s starts with '<'.

sub printWrap {
    my ($s) = @_;

    #   Work out the continuation indent.  A string with no
    #   non-blank character gets an empty indent.

    my ($indent, $first) = ($s =~ m/^(\s*)(\S)/) ? ($1, $2) : ('', '');
    if ($first eq '<') {
        $indent .= ' ';
    }

    #   Pick the input apart line by line and reformat each line,
    #   if necessary, so as not to exceed the maximum line length.

    while (length($s) > 0) {
        unless ($s =~ s/(.*\n)//) {
            my $aal = length($s);
            print(STDERR "printWrap arg = |$_[0]|\n");
            print(STDERR "printWrap s = |$s|\n");
            print(STDERR "printWrap length(s) = $aal\n");
            die("$ifname: Error splitting lines.");
        }
        my $text = $1;
        $text =~ s/\n\z//;      # measure the line without its terminator

        my $wrapped = '';
        my $sep     = '';

        WRAP:
        while (length($text) > $lineWrap) {

            #   A break character may sit at most $lineWrap characters
            #   in, so only that window needs searching.  Using substr
            #   keeps huge wrap widths clear of regex quantifier limits.

            my $window = substr($text, 0, $lineWrap + 1);
            my ($head, $ter, $cut);
            for my $break (qr/\s/, $lineBreak1, $lineBreak2) {
                if ($window =~ m/^(.+)($break)/) {
                    ($head, $ter, $cut) = ($1, $2, $+[0]);
                    last;
                }
            }
            last WRAP unless defined $cut;      # nowhere to break

            if ($ter =~ m/\s+/) {
                $ter = '';      # white space at the break is dropped
            }
            $wrapped .= "$sep$head$ter\n";
            $oline++;
            substr($text, 0, $cut, '');
            $text =~ s/^\s*//;
            $sep = $indent;
        }

        #   Emit what is left; never emit an indent-only line after
        #   a break that consumed the whole line.

        if ($text ne '' || $wrapped eq '') {
            $wrapped .= "$sep$text\n";
            $oline++;
        }
        print {$of} $wrapped
            or die("$ifname: Error writing output: $!\n");
    }
    return;
}

#   Define those silly symbols

#   Symbol4  --  Symbol font translations for HTML 4: character code
#                to character entity reference.  Codes with no usable
#                entity are listed as comments.

sub Symbol4 {
    my %entity_for = (
        0x22 => '&forall;',
        0x24 => '&exist;',
        0x27 => '&ni;',
        0x2d => '&minus;',
      # 0x40 => '(nearlycongruent)',
        0x43 => '&Chi;',
        0x44 => '&Delta;',
        0x46 => '&Phi;',
        0x47 => '&Gamma;',
      # 0x4a => '(curlyJ)',
        0x4c => '&Lambda;',
        0x50 => '&Pi;',
        0x51 => '&Theta;',
        0x53 => '&Sigma;',
      # 0x56 => '(smallsquiggle)',
        0x57 => '&Omega;',
        0x58 => '&Xi;',
        0x59 => '&Psi;',
        0x5c => '&there4;',
        0x5e => '&perp;',
        0x61 => '&alpha;',
        0x62 => '&beta;',
        0x63 => '&chi;',
        0x64 => '&delta;',
        0x65 => '&epsilon;',
        0x66 => '(smallphi)',
        0x67 => '&gamma;',
        0x68 => '&eta;',
        0x69 => '&iota;',
        0x6a => '&phi;',
        0x6b => '&kappa;',
        0x6c => '&lambda;',
        0x6d => "\xb5",         # '&mu;'
        0x6e => '&nu;',
        0x6f => '&omicron;',
        0x70 => '&pi;',
        0x71 => '&theta;',
        0x72 => '&rho;',
        0x73 => '&sigma;',
        0x74 => '&tau;',
        0x75 => '&upsilon;',
      # 0x76 => '(omegabar)',
        0x77 => '&omega;',
        0x78 => '&xi;',
        0x79 => '&psi;',
        0x7a => '&zeta;',
      # 0xa1 => '(curlyY)',
        0xa2 => '&prime;',
        0xa3 => '&le;',
        0xa5 => '&infin;',
        0xa7 => '&clubs;',
        0xa8 => '&diams;',
        0xa9 => '&hearts;',
        0xaa => '&spades;',
        0xab => '&harr;',
        0xac => '&larr;',
        0xad => '&uarr;',
        0xae => '&rarr;',
        0xaf => '&darr;',
        0xb2 => '&Prime;',
        0xb3 => '&ge;',
        0xb4 => '&times;',
      # 0xb5 => '(proportional)',
        0xb6 => '&part;',
        0xb7 => '&bull;',       # NOTE(review): entity assumed; confirm
        0xb9 => '&ne;',
        0xba => '&equiv;',
      # 0xbb => '(roughlyequal)',
        0xbc => '&hellip;',
      # 0xbd => '(vbar)',
        0xbe => '&mdash;',
        0xbf => '&crarr;',
        0xc0 => '&alefsym;',
        0xc1 => '&image;',
        0xc2 => '&real;',
        0xc3 => '&weierp;',
        0xc4 => '&otimes;',
        0xc5 => '&oplus;',
        0xc6 => '&empty;',
        0xc7 => '&cap;',
        0xc8 => '&cup;',
        0xc9 => '&sup;',
        0xca => '&supe;',
        0xcb => '&nsub;',
        0xcc => '&sub;',
        0xcd => '&sube;',
        0xce => '&isin;',
        0xcf => '&notin;',
        0xd0 => '&ang;',
        0xd1 => '&nabla;',
        0xd2 => '&reg;',
        0xd3 => '&copy;',
        0xd4 => '&trade;',
        0xd5 => '&prod;',
        0xd6 => '&radic;',
        0xd7 => '(implies)',
        0xd8 => '&not;',
        0xd9 => '&and;',
        0xda => '&or;',
        0xdb => '&hArr;',
        0xdc => '&lArr;',
        0xdd => '&uArr;',
        0xde => '&rArr;',
        0xdf => '&dArr;',
        0xe0 => '&loz;',
        0xe1 => '&lang;',
        0xe2 => "\xae",         # '&reg;'
        0xe3 => "\xa9",         # '&copy;'
        0xe4 => '&trade;',
        0xe5 => '&sum;',
        0xe9 => '&lceil;',
        0xea => '|',
        0xeb => '&lfloor;',
        0xf1 => '&rang;',
      # 0xf2 => '(integral)',
    );
    for my $code (keys %entity_for) {
        $Symbol[$code] = $entity_for{$code};
    }
    return;
}

#   SymbolAny  --  Symbol font translations that apply whatever the
#                  HTML level.  Called before Symbol3() or Symbol4(),
#                  either of which may override an entry.

sub SymbolAny {
    $Symbol[0x55] = 'Y';
    $Symbol[0xa4] = '/';
    $Symbol[0xa6] = 'f';
    $Symbol[0xb1] = "\xB1";     # '&plusmn;'
    $Symbol[0xb8] = "\xF7";     # '&divide;'
    return;
}

#   Symbol3  --  Symbol font translations for HTML 3: plain-text
#                approximations.  Markup-significant characters are
#                written as &lt; &gt; &amp; so that they reach the
#                reader as text.

sub Symbol3 {
    my %text_for = (
        0x2d => '-',
        0xa2 => "'",
        0xa3 => '&lt;=',
        0xab => '&lt;-&gt;',
        0xac => '&lt;-',
        0xae => '-&gt;',
        0xb2 => "'",
        0xb3 => '&gt;=',
        0xb4 => '*',
        0xb7 => '*',
        0xb8 => '/',
        0xb9 => '=!=',
        0xba => '==',
        0xbb => '~=',
        0xbc => '...',
        0xbd => '|',
        0xbe => '--',
        0xc1 => 'I',
        0xc2 => 'R',
        0xc3 => 'P',
        0xd2 => "\xae",         # '(R)'
        0xd3 => "\xa9",         # '(c)'
        0xd4 => 'TM',
        0xd9 => '&amp;',
        0xda => '|',
        0xdb => '&lt;=&gt;',
        0xdc => '&lt;=',
        0xde => '=&gt;',
        0xdf => '&dArr;',       # NOTE(review): an HTML 4 entity in the
                                # HTML 3 table; no plain-text form known
        0xe1 => '&lt;',
        0xe2 => '(R)',
        0xe3 => '(c)',
        0xe4 => 'TM',
        0xf1 => '&gt;',
    );
    for my $code (keys %text_for) {
        $Symbol[$code] = $text_for{$code};
    }
    return;
}

#   Run as a program unless this file was loaded by another script.
main() unless caller;

1;