#!/usr/bin/perl 
#
# Mark Doll <markdoll@gmx.net>, 2006.
#
# This is a perl implementation of TeXdiff, derived from the original bash and
# perl scripts created by Robert Maron <robmar@mimuw.edu.pl>, available at
# http://www.robmar.net/TexDiff/. This version of texdiff is available at
# http://mark.doll.name/texdiff/.
#
# usage: texdiff old.tex new.tex [diff.tex]
#
# requires the following LaTeX code in the preamble of the LaTeX document:
#
# \usepackage{color} \usepackage{ulem} \usepackage{changebar}
# \newcommand\TLSins[1]{\cbstart{}\textcolor{ins}{\uline{#1}}\cbend{}}
# \newcommand\TLSdel[1]{\cbdelete{}\textcolor{del}{\sout{#1}}}
#
# These macros are NOT automatically interted at \begin{document} like
# the original scripts did it, because this will not work on files of
# a multi-file document (those included by \include or \input).
# Furthermore, if you insert the macros manually, you can tune them as
# you like, i. e. change text color or position of changebars. Have a
# look at the documentation of the ulem and changebar packages on how
# to do this!
#

use strict;

## Problematic commands, that will be removed from deletions and moved
## out of insertions. The array index equals the number of mandatory
## command parameters.

## INSERT YOUR HOMEBREW MACROS HERE!!!
# Omit leading backslash and separate by '|', since this will be
# interpreted as a regular expression.
my @mycommands = (
  '',
  '',
  '',
  '',
  '',
  'Abb(?:ildung)?(?:p[psw])?',
);

## standard LaTeX macros
#FIXME: add more macros here!
my @stdcommands = (
  '\\\\',   # will match one backslash inside regex
);



## allow nesting of braces (aka curly brackets) up to 10**$w-1 levels
my $w = 2;

## wdiff marker
my $delstart = "TLSdel";
my $delend   = "TLEdel";
my $insstart = "TLSins";
my $insend   = "TLEins";

## LaTeX macros used to mark deletions/insertions in the output
my $texdelstart = "\\protect\\TLSdel\{";
my $texdelend   = "\}";
my $texinsstart = "\\protect\\TLSins\{";
my $texinsend   = "\}";


## parse command line
my ($in1,$in2,$out) = @ARGV;

## create temp files from input files with all comments removed and
## all braces numbered
write_temp($w,$in1,'tmp1.'.$$);
write_temp($w,$in2,'tmp2.'.$$);
sub write_temp {
    my ($w,$file,$tmp) = @_;

    $_ = `cat $file`;

    ## mark pairs of braces with the same $w digits.
    # i. e. convert "{ { {} } {}}" to "{01 {02 {03}03 }02 {02}02}01" ($w=2).

    my $max=10**$w-1;
    $::cnt = 0;
    s/
      (?<!\\)
      (?:
        \{ (?{ die "Fatal: Braces nested deeper than $max!\n" if ($::cnt >= $max);
               sprintf("\{%0${w}d",++$::cnt) }) |
        \} (?{ sprintf("\}%0${w}d",$::cnt--) })
      )
    /$^R/gx;

    print STDERR "Warning: ".abs($::cnt)." unmatched ".($::cnt > 0 ? "opening" : "closing")
         ." brace".(abs($::cnt) == 1 ? "" : "s")."\n" if ($::cnt != 0);

    ## remove comments
    # also remove trailing newline and all whitespace at the
    # beginning of the following line like TeX does it
    s/%.*?\n[ \t]*//g;

    ## output
    open(O,">$tmp") or return("Can't open $tmp: $!\n");
    print O;
    close O;
}

my $wdiff_cmd = ( "wdiff"
                . " --start-delete=\'$delstart\' --end-delete=\'$delend\'"
                . " --start-insert=\'$insstart\' --end-insert=\'$insend\'"
                . " tmp1.$$ tmp2.$$");
$_ = `$wdiff_cmd`;



###
### Phase I: generic processing
###

## remove paragraph boundaries (two newlines) that wdiff falsely inserted
# FIXME: Does wdiff always insert same spacing before deletion and the
# following insertion?!
s/(
    ([ \t]*\n[ \t]*\n[ \t]*)
    $delstart (?!$delend)
    (?: . (?!$delend) )* .
    $delend
  )
  \2
  $insstart
 /$1\n$insstart/gsx;


## Restore backslashed Spaces (prevents falsely escaped right braces)
# Escaped spaces '\ ' at the end of an insertion/deletion will result in the
# backslash before the end marker and the space after it. Therefore swap
# whitespace and end marker. (This might falsely make the insertion/deletion
# spann the following paragraph boundary, but this will be fixed by the next
# step below).  When the end markers finally get replaced by $tex(ins|del)end,
# which typically is a '}', without this swapping, this would result in escaped
# right braces and LaTeX complaining about a missing right brace.
s/(?<!\\)\\($delend|$insend)(\s+)/\\$2$1/g;


## split up insertions and deletions that span multiple paragraphs
while (
  s/(
      $delstart (?!$delend)
      (?: . (?! [ \t]*\n[ \t]*\n[ \t]* | $delend ) )*? .?
    )
    ([ \t]*\n[ \t]*\n[ \t]*)
   /$1$delend$2$delstart/gsx ) {}

while (
  s/(
      $insstart (?!$insend)
      (?: . (?! [ \t]*\n[ \t]*\n[ \t]* | $insend ) )*? .?
    )
    ([ \t]*\n[ \t]*\n[ \t]*)
   /$1$insend$2$insstart/gsx ) {}


## Handle unpaired braces
# Search for all unpaired braces, meaning, they are inside a text
# deletion or insertion passage, but their counterpart is not within
# that passage. If they are located inside a deletion passage, simply
# delete them.  If there a inside a insertion passage, move them out
# by placing an insertion end marker before and an insertion start
# marker after the brace. In case that a left brace belongs to a
# preceding LaTeX command (plus and any more zero-parameter commands
# directly preceding that command; that's matched by a non-whitespace
# character sequence started by a backslash), also delete resp. move
# that/these command(s) together with the left brace.
#
# FIXME: why is while loop necessary here, but not for handling right braces?
#
# FIXME: If there's not at least one character before the character
# sequence specified in the look-ahead negative assertion, the match
# would stop at the second occurence of that sequence instead of the
# first. This leads to the effect, that the following regular
# expressions may skip $(del|ins)end if it follows directly the left
# brace (+number). The use of *? instead of * mitigates this effect,
# but it's not perfect.

## 1. unpaired left brace inside a deletion: delete brace and any preceding LaTeX command(s)
while (
  s/(						# $1: everything till unpaired '{'
      $delstart (?!$delend)			# would otherwise not detect end marker of empty deletions!
      (?: . (?!$delend))*?
    )
    ( (?<!\\) \\\S* )?				# $2: optional command(s) preceding the '{'
    (?<!\\) \{([0-9]{$w}) (?!\}\3)		# '{' and $3: number
    (						# $4: everything what's remaining
      (?: . (?! (?<!\\)\}\3 | $delend) )*? .?	# will stop at a matching '}', too
      $delend					# won't match if preceeding line found a matching '}'
    )
   /$1$4/sgx ) {}				# remove the '{' with number and any preceeding command(s)

## 2. unpaired left brace inside an insertion: move brace out including any preceding LaTeX command(s)
while (
  s/(
      $insstart (?!$insend)
      (?: . (?!$insend))*?
           )
           ( \s* (?: (?<!\\) \\\S* )? )		# also move preceeding whitespace preceding out
    (?<!\\) \{([0-9]{$w}) (?!\}\3)
    (
      (?: . (?! (?<!\\)\}\3 | $insend) )*? .?
      $insend
    )
   /$1$insend$2\{$3$insstart$4/sgx ) {}		# move the '{' with number and any preceeding command(s) out

## 3. unpaired right brace inside a deletion: delete brace
s/(							# $1: everything till unpaired '}'
    $delstart (?!$delend)				# would otherwise not detect end marker of empty deletions!
    (?> (?:						## grab all paired braces (prevent backtracking):
      (?: . (?! (?<!\\)\{[0-9]{$w} | $delend) )*? .?	# any text till next (unescaped) '{'
      \{([0-9]{$w})					# '{' with $2: number
      (?: . (?! (?<!\\)\}\2        | $delend) )*? .?	#any text till matching '}'
      \}\2						# '}' with same number $2
    )* )						## end of disabled backtracking
    (?: . (?!$delend))*?				# any text
  )
  ( \}[0-9]{$w} )					# $3: the (first) unpaired '}' with number
  (							# $4: everything what's remaining
    (?: . (?!$delend))*? .?
    $delend
  )
 /$1$4/sgx;						# remove the unpaired '}' with number

## 4. unpaired right brace inside an insertion: move brace out
s/(
    $insstart (?! $insend)
    (?> (?:
      (?: . (?! (?<!\\)\{[0-9]{$w} | $insend ) )*? .?
      \{([0-9]{$w})
      (?: . (?! (?<!\\)\}\2        | $insend ) )*? .?
      \}\2
    )* )
    (?: . (?!$insend))*?
  )
  ( \}[0-9]{$w} \s* )				# also move whitespace following the '}' out
  (
    (?: . (?!$insend))*? .?
    $insend
  )
 /$1$insend$3$insstart$4/sgx;				# move the unpaired '}' with number out


## concatenate consecutive insertions and consecutive deletions
# Don't concatenate over paragraph (two newlines) boundaries, since
# the LaTeX macros used to mark changed passages can't span multiple
# paragraphs. Run this before handling LaTeX-commands specifically to
# allow as much of a command inside a single passage as possible.
s/$delend([ \t]*\n?[ \t]*)$delstart/$1/g;
s/$insend([ \t]*\n?[ \t]*)$insstart/$1/g;



###
### Phase II: LaTeX-command specific processing
###

## For COMPLETELY removed/inserted sections, move section command out.
#
# May create empty insertions/removals, therefore execute this before
# the removal of empty (or whitespace only) insertions/removals!
#
# Partly changed section titles have been handled by the preceding
# rules. In case of removal, prevent renumbering by using asterisk
# form; in case of insertion, add a short form without markers to
# prevent problems when LaTeX'ing the diff.
my $sectionre = "chapter|section|subsection|subsubsection|paragraph";
s/
  (						# $1: text before section command
    $delstart (?!$delend)
    (?: . (?!$delend) )*?
  )
  \\($sectionre)				# '\' and $2: section command
  \{([0-9]{$w})					# '{' and $3: number
  ( (?: . (?! (?<!\\)\}\3 | $delend) )*? .? )	# $4: section command parameter (title)
  ( \}\3 \s* )				# '}' and same number plus following whitespace
  /$1$delend\\$2\*\{$3$delstart$4$delend$5$delstart/sgx;
  # move section commmand out and convert command to asterisk form

s/
  (						# $1: text before section command
    $insstart (?!$insend)
    (?: . (?!$insend) )*?
  )
  \\($sectionre)				# '\' and $2: section command
  \{([0-9]{$w})					# '{' and $3: number
  ( (?: . (?! (?<!\\)\}\3 | $insend) )*? .? )	# $4: section command parameter (title)
  ( \}\3 \s* )				# '}' and same number plus following whitespace
  /$1$insend\\$2\[$4\]\{$3$insstart$4$insend$5$insstart/sgx;
  # move sections command out and add short form without markers


## Remove problematic commands from deletions and moved them out of insertions
for my $num (0 .. ($#mycommands>$#stdcommands ? $#mycommands : $#stdcommands)) {
  my $command = "";

  $command .= $mycommands[$num]	 if ($mycommands[$num]                      );
  $command .= "|"                if ($mycommands[$num] && $stdcommands[$num]);
  $command .= $stdcommands[$num] if (                     $stdcommands[$num]);

  next if(!$command);

  s/
    (						# $1: anything before the command
      $delstart (?!$delend)
      (?: . (?!$delend) )*? .?
    )
    (						# $2: the command with all its parameters
      \\(?:$command)				# the command itself (preceded by a '\')
      (?: \[ .*? (?<!\\)\] )*			# any optional parameters (enclosed in '[]')
      (?: \{([0-9]{$w}) .*? (?<!\\)\}\3 ){$num} # $num mandatory parameters
    )
   /$1/gsx;

  s/(
      $insstart (?!$insend)
      (?: . (?!$insend) )*? .?
    )
    (
      \\(?:$command)
      (?: \[ .*? (?<!\\)\] )*
      (?: \{([0-9]{$w}) .*? (?<!\\)\}\3 ){$num}
    )
   /$1$insend$2$insstart/gsx;
}



###
### Phase III: cleaup, conversion, output
###

## remove insertions and removals that are empty or only mark whitespace
s/$delstart(\s*)$delend/$1/g;
s/$insstart(\s*)$insend/$1/g;


## remove brace numbers
s/(?<!\\)(?:(\{|\})[0-9]{$w})/$1/g;


## substitute final LaTeX macro code
s/$delstart/$texdelstart/g;
s/$delend/$texdelend/g;
s/$insstart/$texinsstart/g;
s/$insend/$texinsend/g;


## output
if (open(O,">$out")) {
    print O;
    close O;
} else {
    print;
}

## remove temp files
unlink('tmp1.'.$$);
unlink('tmp2.'.$$);
