#! /usr/local/bin/perl -w
use strict;

# (C) 2007, Benoit Hudson
# 
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# Create a graph of a set of proofs, from LaTeX sources.  Output is in the
# graphviz 'dot' format.
# The following will generate a pdf of your proof:
#   parse-proof foo.tex | dot -Tpdf -o foo.pdf

# Upon a \label in a theorem-like environment (what counts as theorem-like is
# mentioned below), stores the name of the theorem.  In the subsequent proof
# environment, any \ref will create an edge to the theorem from the reference.
# You can use %\ref or \comment{\ref{...}} to avoid having the ref appear in
# the text.
# TODO: begin and end statements for all environments must be at the beginning
# of the line, or weird things may happen.  This is because I'm lazy.
# TODO: I completely ignore comments.  Commented-out sections of proofs will
# generate edges just as surely as anything else.
#
# Theorem-like environments are labeled below in %nickname.  The name of the
# label must also match the environment: in a theorem, use \label{thm:foo}; in
# a lemma, lem:, and so on.  All other labels are ignored.  I assume that
# statements don't nest (i.e. we don't state a lemma while stating a theorem).
#
# In the output graph, definitions (def) are boxes, theorems (thm) are
# diamonds, everything else is the default.


# Generalize over kinds of statements.  We ignore labels, refs, and
# environments not in this list.
my %environment = (
  'thm' => 'theorem',
  'fact' => 'fact',
  'def' => 'definition',
  'lem' => 'lemma'
);
my %nickname;
for my $key (keys %environment) {
  $nickname{$environment{$key}} = $key;
}

# Parsing information.
my $filename;
my $theorem_name;
my $theorem_type;
my $theorem_long_name;

# The final output.
my %edges; # keyed by \ref, points to list of theorem_name
my %nodes; # keyed by theorem_name, points to the optional long name or undef.

my @states = ('NONE', 'INTHM', 'HAVE_THM', 'INPROOF');
my $state = 'NONE';

sub warning {
  print STDERR ("$filename:$.: warning: ", @_, "\n");
}
sub warn_unnamed {
  warning "unnamed $environment{$theorem_type}";
}
sub warn_unproved {
  warning ("unproved $environment{$theorem_type} ", getName());
}

sub getName {
  if (!defined $theorem_name) {
    warn_unnamed();
    $theorem_name = "$theorem_type:$filename:$.";
  }
  return $theorem_name;
}

sub makeNode {
  die unless defined $theorem_type;
  $nodes{getName()} = $theorem_long_name;
}


sub parse {
  $filename = shift @_;
  open(FILE, $filename) or do {
    warn "Can't open $filename: $!\n";
    return;
  };
  $. = 0;

  while(<FILE>) {
    chomp;

    if (/^\\begin{([^}]*)}/) {
      my $env = $1;
      if (exists $nickname{$env}) {
        $theorem_type = $nickname{$env};
        if (/\[([^\]]*)\]/) {
          $theorem_long_name = $1;
          $theorem_long_name =~ s/\\/\\\\/;
        } else {
          $theorem_long_name = undef;
        }
        $state = 'INTHM';
      } elsif ($env eq 'proof') {
        unless ($state eq 'HAVE_THM') {
          warning "proof without theorem";
          next;
        }
        makeNode();
        $state = 'INPROOF';
      }
    }
    elsif (/^\\end{([^}]*)}/) {
      my $env = $1;
      if ($env eq 'proof') {
        $theorem_type = undef;
        $theorem_name = undef;
        $state = 'NONE';
      }
      elsif ($state eq 'INTHM' && $env eq $environment{$theorem_type}) {
        $state = 'HAVE_THM';
      }
    }
    elsif (/\\label{([^}]*)}/) {
      my $label = $1;
      if ($state eq 'INTHM' && $label =~ /^$theorem_type/) {
        $theorem_name = $label;
        makeNode();
      }
      # TODO
      warning "multiple labels" if /\\label.*\\label/;
    }
    elsif ($state ne 'NONE' && /\\ref{([^\}]*)}/) {
      my $ref = $1;
      if (exists $edges{$ref}) {
        push(@{$edges{$ref}}, getName());
      } else {
        $edges{$ref} = [getName()];
      }
      warning "multiple refs" if /\\ref.*\\ref/;
    }
  }
}

##################
## output

# Print one node.  Does not print nodes already printed.
# Adds options in [] if there are any to add.  Particularly, this is how I set
# the shape of def and thm nodes, and the label in general.
my %printed_nodes;
sub printNode {
  my $node = shift @_;
  return if $printed_nodes{$node}++;

  print "  \"$node\"";

  # collect a number of options to print with the node.
  my @options;

  # print the long name, if any, as the label to use for the node
  if (defined $nodes{$node}) {
    push(@options, "label=\"$nodes{$node}\"");
  }

  # set the shape according to type
  if ($node =~ /^thm:/) {
    push(@options, "shape=diamond");
  } elsif ($node =~ /^def:/) {
    push(@options, "shape=box");
  }

  # print the options
  print ' [', join(' ', @options), ']' if @options;
  print ";\n";
}

# print the nodes, and then the edges
sub output {
  print "digraph proof {\n";

  # collect all nodes, whether we saw them defined or not
  my @nodes = keys %nodes;
  for my $from (keys %edges) {
    push(@nodes, $from);
    for my $to (@{$edges{$from}}) {
      push(@nodes, $to);
    }
  }

  for my $node (sort @nodes) {
    printNode($node);
  }

  for my $from (sort keys %edges) {
    for my $to (@{$edges{$from}}) {
      print "  \"$from\" -> \"$to\";\n";
    }
  }
  print "}\n";
}

###########################################################################
## Mainline!
for (@ARGV) {
  parse $_;
}
output();