User: Jmath666/latex2wiki.pl
#!/usr/bin/perl
# translate LaTeX to wiki
# written and maintained by User:Jmath666
# with code contributions from User:Oleg Alexandrov
# archived at User:Jmath666/latex2wiki.pl
# February 2007
# usage:
# 1. from the command line: edit the script to set $standalone=1, then
#      ./latex2wiki.pl file.tex > file.wiki
#      cat file.tex file.bbl | ./latex2wiki.pl - > file.wiki
# 2. as cgi script (the default, $standalone=0)
use strict; # 'strict' insists that all variables be declared
use diagnostics; # 'diagnostics' expands the cryptic warnings
# parameters
# Run mode, tested by the MAIN block and by the require below.
my $standalone=0; # 0 to run as cgi, 1 from command line
# TeX spacing appended to every converted formula; parse_latex reads both
# settings. Per the original note this is meant to make the wiki render the
# formula as PNG -- NOTE(review): confirm against current MediaWiki behaviour.
my $png_inline=0; # inline math: 0 = append nothing, 1 = append \, , 2 = append \,\!
my $png_display=0; # display math: 0 = append nothing, 1 = append \, , 2 = append \,\!
if (!$standalone) {
require "cgi-lib.pl"; # only loaded in CGI mode; presumably provides ReadParse used by MAIN
}
undef $/; # undefine the input record separator so one read returns the whole input in one scalar
# Entry point: obtain the LaTeX source, convert it with parse_latex() and
# print the result.
#
# Standalone mode ($standalone true): the source is read from the file named
# by the first command-line argument; "-" means standard input, as in
#   cat file.tex file.bbl | ./latex2wiki.pl - > file.wiki
# CGI mode: the source is taken from the form field 'ltext' and the output
# is wrapped by print_head()/print_foot().
#
# Dies with a message if no file argument is given or the file cannot be
# opened.
MAIN: {
    my (%input, $ltext);

    if ($standalone) {
        my $file = $ARGV[0];    # the command line argument
        defined $file
            or die "usage: $0 file.tex   (use - to read standard input)\n";

        if ($file eq '-') {
            # Three-argument open would treat "-" as a file name, so the
            # documented stdin convention is handled explicitly.
            $ltext = do { local $/; <STDIN> };
        }
        else {
            open(my $fh, '<', $file) or die "cannot open $file: $!\n";
            $ltext = do { local $/; <$fh> };    # slurp the whole file
            close($fh);
        }
    }
    else {
        # Read in all the variables set by the form
        ReadParse(\%input);
        $ltext = $input{'ltext'};
    }

    # An empty file or a missing form field yields undef; convert nothing
    # rather than triggering uninitialized-value diagnostics.
    $ltext = '' unless defined $ltext;

    # Print the header
    # NOTE(review): this is emitted in standalone mode too, exactly as the
    # original did; confirm whether command-line output should carry it.
    print "Content-type: text/html\n\n";

    # process the text
    $ltext = parse_latex($ltext);

    # print the processed text in a Wikipedia textbox
    print_head() if !$standalone;
    print "$ltext\n";
    print_foot() if !$standalone;
}
# Translate a LaTeX document into MediaWiki markup.
#
# Parameters:
#   $text - the complete LaTeX source as one string; the contents of a
#           .bbl file (a thebibliography environment) may be appended.
# Returns:
#   the converted wiki text, with surrounding whitespace removed and runs
#   of blank lines collapsed.
#
# Reads the file-level settings $png_inline and $png_display and calls
# convert_font() for font commands. Nested list environments are not
# supported.
sub parse_latex{
    my $text = shift;
    $text = '' unless defined $text;

    # get rid of comments; an escaped percent sign (\%) is literal text
    $text =~ s/(?<!\\)%.*\n//g;
    $text =~ s/\r//g; # get rid of carriage returns

    # emphasize an empty line with more returns
    $text =~ s/\n[\t ]*\n/\n\n\n/g;

    # remove all other newlines (join the lines of a paragraph)
    $text =~ s/[ \t]*\n[ \t]*([^\n])/ $1/g;

    # get rid of artefact space
    $text =~ s/\\ \s*/ /sg; # get rid of explicit space
    $text =~ s/~/ /g; # get rid of unbreakable space

    # strip the preamble
    $text =~ s/^.*?\\begin\{document\}//sig;
    $text =~ s/^.*?\\maketitle//sig;

    # strip end document but leave what follows it
    $text =~ s/\\end\{document\}//sig;

    # strip abstract markers, keep the abstract text
    $text =~ s/\\begin\{abstract\}(.*?)\\end\{abstract\}/\n$1\n\n/sg;

    # fix some missing TeX macros
    $text =~ s/\\widetilde/\\tilde/sg;
    $text =~ s/\\operatorname\*\{(\w*?)\}/\\mathrm\{$1\}/sg;
    $text =~ s/\\allowbreak//sg;
    # $text =~ s/\\right([^\w])/$1/sg;
    # $text =~ s/\\left([^\w])/$1/sg;
    $text =~ s/\\-//sg;
    $text =~ s/\\_/_/sg;
    $text =~ s/\\textquotedblleft/\&ldquo\;/gs;
    #$text =~ s/\\textquotedblright/\&rdquo\;/gs;
    $text =~ s/\\begin\{center\}(.*?)\\end\{center\}/$1/sg;

    # math tags: every formula is wrapped in <math>...</math>; the closing
    # tag is preceded by the TeX spacing selected by $png_inline/$png_display
    my %tex_space = ( 0 => '', 1 => '\\,', 2 => '\\,\\!' );
    my $ms   = '<math>';
    my $me_i = ( exists $tex_space{$png_inline}  ? $tex_space{$png_inline}  : '' ) . '</math>';
    my $me_d = ( exists $tex_space{$png_display} ? $tex_space{$png_display} : '' ) . '</math>';

    $text =~ s/\s*\$\$\s*(.*?)\s*\$\$\s*/\n\n:$ms$1$me_d\n\n/sg;
    $text =~ s/\$(.*?)\$/$ms\\textstyle $1$me_i/g;
    $text =~ s/\\begin\{equation\}(.*?)\\end\{equation\}/\n\n:$ms$1$me_d\n\n/sg;
    $text =~ s/\\begin\{equation\*\}(.*?)\\end\{equation\*\}/\n\n:$ms$1$me_d\n\n/sg;
    $text =~ s/\\\[(.*?)\\\]/\n\n:$ms$1$me_d\n\n/sg;
    $text =~ s/\\begin\{align\}(.*?)\\end\{align\}/\n\n:$ms\\begin\{align\}$1\\end\{align\}$me_d\n\n/sg;
    $text =~ s/\\begin\{align\*\}(.*?)\\end\{align\*\}/\n\n:$ms\\begin\{align\}$1\\end\{align\}$me_d\n\n/sg;

    # get rid of all labels and references to them
    $text =~ s/\\label\{.*?\}//g;
    $text =~ s/\\ref\{.*?\}//g;

    # get rid of all tex definitions
    $text =~ s/\\def\\.*?\{.*?\}//g;

    # convert sections and subsections
    $text =~ s/\s*\\section.*?\{(.*?)\}\s*/\n\n==$1==\n\n/sig;
    $text =~ s/\s*\\subsection.*?\{(.*?)\}\s*/\n\n===$1===\n\n/sig;

    # parse bibliography into %bib with key=label value=entry
    my %bib;
    my $mark  = "__bib__";
    my $btext = $text;
    $btext =~ s/\s*\\newblock\s*/ /g; # odd bibtex command
    $btext =~ s/\s*\\end\{thebibliography\}.*$//s; # strip all after bib entries
    $btext =~ s/\\bibitem\{(.*?)\}/$mark$1$mark/g; # bibitem -> mark
    if ($btext =~ /\Q$mark\E/) {
        $btext =~ s/^.*?\Q$mark\E//s; # strip all before bib entries
        $btext =~ s/\{(\w)\}/$1/sg;   # {X} -> X (case-protecting braces)
        $btext = convert_font($btext);
        #$btext =~ s/\{(.*?)\}/$1/sg; # get rid of {}
        # the text is now key, entry, key, entry, ...
        %bib = split(/\Q$mark\E/, $btext);
    }

    $text = convert_font($text);

    # replace the bibliography section; the wiki generates the list itself
    # from the <ref> tags inserted below
    $text =~ s/\s*\\begin\{thebibliography\}.*?\\end\{thebibliography\}/\n\n==References==\n\n<references\/>\n\n/sg;

    # preprocess references made by alternatives to \cite
    $text =~ s/\\citet\{/\\cite\{/sg;
    $text =~ s/\\citep\{/\\cite\{/sg;

    # get rid of optional arguments to \cite
    $text =~ s/\\cite\[.*?\]/\\cite/sg;
    $text =~ s/\\citep\[.*?\]/\\cite/sg;
    $text =~ s/\\citet\[.*?\]/\\cite/sg;

    # split \cite{a,b,..} into separate \cite, one key per pass; blanks
    # after the comma are dropped so that "a, b" yields the key "b"
    1 while $text =~ s/\\cite\{([^\}]*?),\s*(.*?)\}/\\cite\{$1\}\\cite\{$2\}/s;

    # add references per Wikipedia:Footnote
    # replace \cite{foo} by <ref name="foo">entry</ref> or <ref name="foo"/>
    foreach my $bibkey (keys %bib) {
        my $entry = $bib{$bibkey};
        $entry = '' unless defined $entry;
        $entry =~ s/^\s+//;
        $entry =~ s/\s+$//;
        # replace first occurrence by full entry
        # (\Q..\E: keys may contain regex metacharacters such as + or .)
        $text =~ s/\\cite\{\Q$bibkey\E\}/<ref name="$bibkey">$entry<\/ref>/s;
        # and remaining occurrences by terminated tag
        $text =~ s/\\cite\{\Q$bibkey\E\}/<ref name="$bibkey"\/>/sg;
    }

    # get rid of extra bibliography related commands
    $text =~ s/\\bibliographystyle\{.*?\}//;
    $text =~ s/\\bibliography\{.*?\}//;

    # get rid of all float code
    $text =~ s/\\begin\{figure\}.*?\\end\{figure\}/::FIGURE DELETED/sg;
    $text =~ s/\\begin\{table\}.*?\\end\{table\}/::TABLE DELETED/sg;
    $text =~ s/\\begin\{tabular\}.*?\\end\{tabular\}/::TABLE DELETED/sg;

    # list environments - nested not supported yet
    foreach my $list ( [ 'enumerate', '#' ], [ 'itemize', '*' ] ) {
        my ($env, $marker) = @$list;
        $text =~ s/\\begin\{$env\}(.*?)\\end\{$env\}/_wiki_list($1, $marker)/sge;
        # drop unmatched leftovers (e.g. from nested lists)
        $text =~ s/\\(?:begin|end)\{$env\}//sg;
    }

    # strip extra newlines and rm space at the beginning and end (this better be the last thing in the code)
    $text =~ s/^\s*(.*?)\s*$/$1/sg;
    $text =~ s/[ \t]*\n[ \t]*\n\s*/\n\n/g;

    return $text;
}

# Turn the body of one list environment into wiki list lines: every \item
# becomes $marker at the start of its own line, which is where MediaWiki
# requires list markers to be.
sub _wiki_list {
    my ($body, $marker) = @_;
    $body =~ s/\s*\\item(?![A-Za-z])\s*/\n$marker /g;
    return "\n$body\n\n";
}
# Convert LaTeX font commands to wiki markup.
#
#   {\bf text}                          -> '''text'''  (bold)
#   {\it text}, {\em text}, \emph{text} -> ''text''    (italic)
#   {\sc text}                          -> text        (no wiki small caps)
#
# Parameters:
#   $text - LaTeX text
# Returns:
#   the text with the font groups above replaced.
#
# The group body may itself contain brace groups one level deep, as in
# {\em A {B}ayesian filter}; deeper nesting is not handled.
sub convert_font{
    my $text = shift;

    # shortest run of non-brace characters and simple {...} groups
    my $body = qr/(?:[^{}]|\{[^{}]*\})*?/;

    # The (?![A-Za-z]) guard keeps \bf, \it, \em from matching the start of
    # longer command names such as \bfseries, \itshape or \emph.
    $text =~ s/\{\s*\\bf(?![A-Za-z])\s*($body)\s*\}/'''$1'''/g;
    $text =~ s/\{\s*\\sc(?![A-Za-z])\s*($body)\s*\}/$1/g;
    $text =~ s/\{\s*\\it(?![A-Za-z])\s*($body)\s*\}/''$1''/g;
    $text =~ s/\{\s*\\em(?![A-Za-z])\s*($body)\s*\}/''$1''/g;
    $text =~ s/\\emph\s*\{($body)\}/''$1''/g;

    return $text;
}
# Oleg's routines to deal with the web form
sub print_head {
print '