Namespaces
Variants
Actions

Difference between revisions of "Help:Normalize EoM references"

From Encyclopedia of Mathematics
Jump to: navigation, search
m
m (→‎Perl script: Now unicode (utf8) letters are considered and correctly collated)
Line 66: Line 66:
  
 
# All those words after the ref descriptor (in brackets) are collected
 
# All those words after the ref descriptor (in brackets) are collected
# which start with a capital followed by a small letter and ended by a
+
# wich start with a capital followed by a small letter and ended by a
 
# comma (like Abel, ), and the first two letters of all these words
 
# comma (like Abel, ), and the first two letters of all these words
 
# are concatenated and used as new reference descriptor [Ab].  
 
# are concatenated and used as new reference descriptor [Ab].  
Line 76: Line 76:
 
use strict;
 
use strict;
 
use utf8;
 
use utf8;
 +
use Encode;
 +
use  Unicode::Collate;
  
 
# get file content:
 
# get file content:
 
undef $/;
 
undef $/;
my $f = <>;
+
my $f = decode('UTF-8', readline STDIN);
 +
 
 +
 
 +
 
 +
 
 +
$f = &transcribe($f);
 +
$f = encode('UTF-8',$f);
 +
#print "...............\n";
 +
print $f;
 +
exit(0);
 +
 
 +
sub transcribe {
 +
    my $f = $_[0];
 +
# for unicode sorting/collating, see below
 +
# Package libunicode-collate-perl required
 +
    my $Collator = Unicode::Collate->new();
  
 
# collect all reference lists in array
 
# collect all reference lists in array
my @refs = ($f =~ /\=+References\=+\s+(\<table\>.*?\<\/table\>)/sg);
+
    my @refs = ($f =~ /\=+References\=+\s+(\<table\>.*?\<\/table\>)/sg);
if ($#refs < 0) {
+
    if ($#refs < 0) {
    print "Error:  no refs: length of refs array: $#refs\n"; exit(1);
+
print "Error:  no refs: length of refs array: $#refs\n"; exit(1);
}
+
    }
  
my %H = (); # Hash for ref keys like [1] etc.
+
    my %H = (); # Hash for ref keys like [1] etc.
my %K = (); # Hash for multiplicity of ref key occurence
+
    my %K = (); # Hash for multiplicity of ref key occurence
  
 
# Array for all initial keys:
 
# Array for all initial keys:
my @items = ($f =~ /\>\[(\w+?)\]\<.*?,/sg);
+
    my @items = ($f =~ /\>\[(\w+?)\]\<.*?,/sg);
foreach(@items) {  
+
    foreach(@items) {  
    my $key = $_;
+
my $key = $_;
    my $y = "";
+
my $y = "";
    # find names after $key and before next comma:
+
# find names after $key and before next comma:
    if ($f =~ /\>\[$key\]\<(.*?),\s*["\']/sg) {
+
if ($f =~ /\>\[$key\]\<(.*?),\s*["\']/sg) {
my $x = $1;
+
    my $x = $1;
# collect first two letters of names  
+
    # collect first two letters of names  
# and concat into $y, will serve as new key:
+
    # and concat into $y, will serve as new key:
my @A = ($x =~ / ([A-Z]\w)[^\.]/sg);
+
#     my @A = ($x =~ / ([A-Z]\w)[^\.]/sg);
foreach(@A) { $y .= $_; }
+
            # required for unicode match:
    }
+
    my @A = ($x =~ / (\p{Lu}\p{Ll})[^\.]/sg);
    $K{$y}++;  
+
    foreach(@A) { $y .= $_; }
    if ($K{$y} > 1) { # append count if bigger than 1
+
}
$H{$key} = $y.$K{$y};
+
$K{$y}++;  
    } else {
+
if ($K{$y} > 1) { # append count if bigger than 1
$H{$key} = $y;
+
    $H{$key} = $y.$K{$y};
 +
} else {
 +
    $H{$key} = $y;
 +
}
 
     }
 
     }
}
 
  
$f =~ s/\[\[#References\|\[(\w+?)\]\]\]/{{Cite|$H{$1}}}/sg;
+
    $f =~ s/\[\[#References\|\[(\w+?)\]\]\]/{{Cite|$H{$1}}}/sg;
$f =~ s/\|\s*\[(\w?)\]\s*\|/|{{Ref|$1}}|/sg;
+
    $f =~ s/\|\s*\[(\w?)\]\s*\|/|{{Ref|$1}}|/sg;
  
  
foreach(@refs) { # process all ref lists:
+
    foreach(@refs) { # process all ref lists:
 
#    print "$_\n";
 
#    print "$_\n";
    my $x = $_;
+
my $x = $_;
    # do table conversion:
+
# do table conversion:
    my $y = &replace($x);
+
my $y = &replace($x);
  foreach(keys %H) {
+
foreach(keys %H) {
$y =~ s/\[$_\]/{{Ref|$H{$_}}}/sg;
+
    $y =~ s/\[$_\]/{{Ref|$H{$_}}}/sg;
 +
}
 +
# sort bib entries by ref keys
 +
my %R = ();
 +
my @A = split(/\|\-/,$y);
 +
foreach(@A) {
 +
    if (/\{\{Ref\|(\w+)\}\}/) { $R{$1} = $_; }
 +
}
 +
my $z = '{|'."\n".'|-';
 +
# foreach(sort keys %R) { $z .= $R{$_}.'|-';  }
 +
foreach($Collator->sort(keys %R)) { $z .= $R{$_}.'|-';  }
 +
 
 +
$z .= "\n\|\}\n";
 +
 +
# do replacements:
 +
# prepend a backslash before each of <>"=()[]+-?|^$*\~
 +
$x =~ s/([\<\>\"\=\(\)\[\]\+\-\?\|\^\$\*\\\~])/\\$1/sg;
 +
$f =~ s/$x/$z/sg;
 
     }
 
     }
     # sort bib entries by ref keys
+
     return $f;
    my %R = ();
 
    my @A = split(/\|\-/,$y);
 
    foreach(@A) {
 
if (/\{\{Ref\|(\w+)\}\}/) { $R{$1} = $_; }
 
    }
 
    my $z = '{|'."\n".'|-';
 
    foreach(sort keys %R) { $z .= $R{$_}.'|-';  }
 
    $z .= "\n\|\}\n";
 
 
 
    # do replacements:
 
    # prepend a backslash before each of <>"=()[]+-?|^$*\~
 
    $x =~ s/([\<\>\"\=\(\)\[\]\+\-\?\|\^\$\*\\\~])/\\$1/sg;
 
    $f =~ s/$x/$z/sg;
 
 
}
 
}
 
#print "...............\n";
 
print $f;
 
 
exit(0);
 
  
 
## converts bib table from html to wiki structure
 
## converts bib table from html to wiki structure
Line 149: Line 167:
 
## cf. http://en.wikipedia.org/wiki/Help:Table#Pipe_syntax_tutorial
 
## cf. http://en.wikipedia.org/wiki/Help:Table#Pipe_syntax_tutorial
 
     my $f = $_[0];
 
     my $f = $_[0];
 +
    $f =~ s/\n/ /sg;
 
     # replace data cells (last | still to be removed!):
 
     # replace data cells (last | still to be removed!):
     $f =~ s/\<td (.*?)\>(.*?)\<\/td\>/|$1|$2|/sgi;
+
     $f =~ s/\<td\s(.*?)\>(.*?)\<\/td\>/|$1|$2|/sgi;
 
     # replace row (consider removal of last | from above):
 
     # replace row (consider removal of last | from above):
 
     $f =~ s/\<tr\>(.*?)[\s\|]*\<\/tr\>\s*/|\-\n$1\n/sgi;
 
     $f =~ s/\<tr\>(.*?)[\s\|]*\<\/tr\>\s*/|\-\n$1\n/sgi;
Line 164: Line 183:
 
     return $f;
 
     return $f;
 
}
 
}
 +
 +
 
</pre>
 
</pre>

Revision as of 13:17, 20 February 2012


Perl script to normalize old EoM reference tables

The Perl script below can be used to transcribe old <table>-based reference lists into Wikipedia table style, using reference descriptors based on the authors' names, ordering the tables alphabetically by those descriptors, and installing links and anchors using Template:Ref and Template:Cite.

If an external editor is used to edit EoM pages, then its application should be easy. For example, if the external editor is emacs, and if the perl script code below is stored in some file

~/encyclopedia/references/eom_refs.pl

then, after loading the page code into emacs (for example by using the "It's All Text" add-on), the following code (below named Emacs macro) inserted in the user's .emacs file will reduce the application of the perl script to just one keystroke "Ctrl ," (that is, simultaneously press the control key and the comma key).

Emacs macro

;; Requires the filter script eom_refs.pl, which rewrites <table>-based
;; reference lists into wikipedia style tables.
;;

(defun eom-references ()
  "Pipe the whole buffer through eom_refs.pl and replace it with the output.
Both `coding-system-for-read' and `coding-system-for-write' are bound
to utf-8 for the duration of the call."
  (interactive)
  (let ((filter-script "~/encyclopedia/references/eom_refs.pl")
        (coding-system-for-read 'utf-8)
        (coding-system-for-write 'utf-8))
    ;; OUTPUT-BUFFER is nil and REPLACE is t: the region (here the
    ;; whole buffer) is replaced by the command's output.
    (shell-command-on-region (point-min) (point-max)
                             filter-script nil t)))

;; Bind the command to "Ctrl ,".
(global-set-key (kbd "C-,") 'eom-references)

Perl script

#!/usr/bin/perl -w

# This script operates like a filter, i.e., it reads a file from
# standard input, rewrites its content and writes the transcript to
# standard output.

# It rewrites eom reference tables into wikipedia table style. It uses
# the Ref and Cite templates.

# It replaces the reference descriptors (like [1], [2], etc.) by the
# first two letters of the author names (like [Ab], [Ha], [KuZi],
# etc.) and orders the tables alphabetically with respect to these
# descriptors.  Moreover, in the tables, anchors are set by
# {{Ref|Ab}} using the Ref template while, within the text, references
# {{Cite|Ab}} to these anchors are installed by the Cite template.

# Author names are retrieved by the following heuristic: 

# All those words after the ref descriptor (in brackets) are collected
# which start with a capital followed by a small letter and end with a
# comma (like Abel, ), and the first two letters of all these words
# are concatenated and used as new reference descriptor [Ab].

# In case of multiplicities of such strings the second, third, ... gets a
# '2','3', ... appended like [Ab2], [Ab3] etc.


use strict;
use utf8;
use Encode;
use  Unicode::Collate;

# Get the file content: slurp all of standard input and decode it from
# UTF-8.  $/ is localized inside the do-block so that slurp mode does
# not stay switched on for the rest of the program.
my $f = do {
    local $/;
    my $octets = readline STDIN;
    # readline yields undef when nothing could be read at all; report
    # that instead of handing undef to decode().
    die "Error:  cannot read standard input: $!\n" unless defined $octets;
    decode('UTF-8', $octets);
};

# Rewrite the page text, re-encode it as UTF-8 and write it to standard
# output.
$f = transcribe($f);
$f = encode('UTF-8', $f);
#print "...............\n";
print $f;
exit(0);

# Rewrite the text of one page and return the result.
#
# Argument:  the (already decoded) page text.
# Returns:   the page text in which
#   * every <table> following a "References" heading has been converted
#     to wiki table syntax by replace(), with its rows sorted by the
#     new reference keys;
#   * the old reference keys (like [1]) have been replaced by keys
#     built from the first two letters of the detected author names,
#     with a running number appended to repeated keys;
#   * links of the form [[#References|[1]]] have become {{Cite|...}}
#     and the keys inside the tables have become {{Ref|...}}.
#
# If no reference table is found, an error message is written to
# standard error and the program exits with status 1.
sub transcribe {
    my $f = $_[0];

    # For unicode sorting/collating of the new keys, see below.
    # Package libunicode-collate-perl required.
    my $Collator = Unicode::Collate->new();

    # Collect all reference lists in an array.
    my @refs = ($f =~ /\=+References\=+\s+(\<table\>.*?\<\/table\>)/sg);
    if (!@refs) {
        # Diagnostics go to STDERR: STDOUT carries the filtered page.
        print STDERR "Error:  no refs: length of refs array: $#refs\n";
        exit(1);
    }

    my %H = ();    # maps each old ref key (like 1) to its new key
    my %K = ();    # multiplicity of each new ref key

    # All initial keys, in order of appearance:
    my @items = ($f =~ /\>\[(\w+?)\]\<.*?,/sg);
    foreach my $key (@items) {
        my $y = "";

        # Find the names after $key and before the next comma that is
        # followed by a quote.
        # NOTE(review): this is a scalar-context //g match, so each
        # search resumes where the previous successful one ended (and
        # restarts from the beginning after a failure).  Kept as is;
        # confirm that this is intended.
        if ($f =~ /\>\[$key\]\<(.*?),\s*["\']/sg) {
            my $x = $1;
            # Collect the first two letters of the names (an uppercase
            # letter followed by a lowercase one, unicode aware) and
            # concatenate them; the result serves as the new key.
            my @A = ($x =~ / (\p{Lu}\p{Ll})[^\.]/sg);
            $y = join('', @A);
        }

        # No author name detected: keep the old key.  An empty new key
        # would give {{Ref|}}, which the row filter below does not
        # recognize, so the entry would vanish from the sorted table.
        if ($y eq "") {
            $H{$key} = $key;
            next;
        }

        $K{$y}++;
        # Append the count if the key has been produced before.
        $H{$key} = ($K{$y} > 1) ? $y . $K{$y} : $y;
    }

    # In-text citations.  A citation whose key has no table entry is
    # left untouched rather than turned into an empty {{Cite|}}.
    $f =~ s/(\[\[#References\|\[(\w+?)\]\]\])/exists $H{$2} ? '{{Cite|' . $H{$2} . '}}' : $1/sge;

    # NOTE(review): \w? matches at most one character and the old key
    # is inserted unchanged; possibly \w+? and $H{$1} were meant.
    # Left byte-identical because the intent is not clear from here.
    $f =~ s/\|\s*\[(\w?)\]\s*\|/|{{Ref|$1}}|/sg;

    foreach my $table (@refs) {    # process all ref lists
        # Do the table conversion, then turn the old keys into anchors.
        my $y = replace($table);
        foreach my $old (keys %H) {
            $y =~ s/\[$old\]/{{Ref|$H{$old}}}/sg;
        }

        # Sort the bib entries by ref key.  Only rows that carry a
        # {{Ref|...}} anchor are kept.
        my %R = ();
        foreach my $row (split(/\|\-/, $y)) {
            if ($row =~ /\{\{Ref\|(\w+)\}\}/) { $R{$1} = $row; }
        }
        my $z = '{|' . "\n" . '|-';
        foreach my $k ($Collator->sort(keys %R)) { $z .= $R{$k} . '|-'; }
        $z .= "\n|}\n";

        # Replace the original table by the new one.  \Q...\E quotes
        # every regex metacharacter in the table text, including braces
        # and dots, so the table is always matched literally.
        $f =~ s/\Q$table\E/$z/sg;
    }
    return $f;
}

## Converts a bib table from html to wiki (pipe syntax) structure.
## Takes the html text as its only argument and returns the converted
## text; the argument itself is not modified.
## cf. http://en.wikipedia.org/wiki/Help:Table#Pipe_syntax_tutorial
sub replace {
    my ($markup) = @_;

    # Apply the rewriting steps, in order, to the local copy.
    for ($markup) {
        # join everything onto one line:
        s/\n/ /sg;
        # data cells become |attributes|content| (the last | is
        # removed by the row step below):
        s/\<td\s(.*?)\>(.*?)\<\/td\>/|$1|$2|/sgi;
        # rows become "|-" lines; trailing blanks and | are dropped:
        s/\<tr\>(.*?)[\s\|]*\<\/tr\>\s*/|\-\n$1\n/sgi;
        # no spaces between consecutive |:
        s/\|\s*\|/||/sgi;
        # <table> and </table>:
        s/\<table\>/\{|\n/sgi;
        s/\<\/table\>/|\-\n|\}/sgi;
        # remove a possible | before template calls:
        s/\|\s*\{\{/\{\{/sg;
        # remove spaces before commas and full stops:
        s/ +([\,\.])/$1/sg;
    }

    return $markup;
}


How to Cite This Entry:
Normalize EoM references. Encyclopedia of Mathematics. URL: http://encyclopediaofmath.org/index.php?title=Normalize_EoM_references&oldid=21176