#! /usr/bin/perl -w
# Example perl file - extract H1,H2 or H3 headers from HTML files
# Run via:
# perl this-perl-script.pl [-o outputfile] input-file(s)
# E.g.
# perl proto-getH1.pl -o headers *.html
# perl proto-getH1.pl -o output.txt homepage.htm
#
# Russell Quong 2/19/98
require 5.003; # need this version of Perl or newer
use English; # use English names, not cryptic ones
use FileHandle; # use FileHandles instead of open(),close()
use Carp; # get standard error / warning messages
use strict; # force disciplined use of variables
## Script-wide settings and state shared by the subs in this file.
my $author    = "Russell W. Quong";   # reported in the start-up banner
my $version   = "Version 1.0";        # reported in the start-up banner
my $reldate   = "Jan 1998";           # reported in the start-up banner
my $lineno    = 0;                    # line number within the file being read
my $OUT       = \*STDOUT;             # output stream; stdout until changed
my @headerArr = ();                   # every heading found, kept for sorting
# fyi (MESSAGE);
# Emit a non-crucial, for-your-information message followed by a newline.
# Routing every such message through this one sub means they can all be
# switched on or off by editing a single place.
sub fyi ($) {
    my $message = shift;
    print $message . "\n";
}
# main ();
# Program driver: print a banner, consume the command-line flags, process
# each input file in order, then emit the sorted summary.
sub main () {
    fyi("perl script = $PROGRAM_NAME, $version, $author, $reldate.");
    handle_flags();

    # Whatever is left in @ARGV after flag handling is the list of input
    # files; with nothing left, read standard input (spelled '-') instead.
    my(@inputs) = (@ARGV ? @ARGV : ('-'));
    my($name);
    foreach $name (@inputs) {
        handle_file($name);
    }

    postProcess();      # additional processing after reading input
}
# handle_flags ();
# Consume the leading flags from @ARGV, leaving only the input file names.
# We assume flags begin with a '-' (dash or minus sign).
#
# The only flag understood is -o, which takes the next argument as the name
# of the output file and points $OUT at it.  -o may be given more than once;
# the last one wins.  Processing stops at the first argument that is not an
# -o flag.
#
# Dies (via croak) if -o has no file name after it, or if the output file
# cannot be opened for writing.
sub handle_flags () {
    # Loop on the head of @ARGV rather than with foreach: removing elements
    # from an array while foreach is walking it makes the loop skip entries.
    while (@ARGV && $ARGV[0] =~ /^-o/) {
        shift @ARGV;                    # discard the -o flag itself
        my($oname) = shift @ARGV;       # the argument after -o
        if (! defined($oname) || $oname eq "") {
            croak "Missing output file name after -o. Bye-bye.";
        }

        # An explicit mode argument makes FileHandle treat $oname purely as
        # a file name, instead of parsing it as part of an open() string.
        my($fh) = FileHandle->new($oname, "w");
        if (! defined($fh)) {
            croak "Unable to open output file: $oname: $!. Bye-bye.";
        }
        $OUT = $fh;
    }
}
# handle_file (FILENAME);
# Open the file named FILENAME for reading and pass it to read_file().
# If FILENAME is '-', standard input is read instead.
#
# A file that cannot be opened is reported with a warning (via carp) and
# skipped, so the remaining input files are still processed.
sub handle_file ($) {
    my($infile) = @_;
    fyi(" handle_file($infile)");

    if ($infile eq "-") {
        read_file(\*STDIN, "[stdin]");  # \*STDIN = input stream for STDIN
        return;
    }

    # The explicit "r" mode makes FileHandle treat $infile as a literal file
    # name.  Without it, a name taken from the command line such as ">x" or
    # "cmd |" would be interpreted as an open() mode or a pipe.
    my($IN) = FileHandle->new($infile, "r");
    if (! defined($IN)) {
        # A real error, not a debugging note: report it as a warning so it
        # is still seen when fyi() messages are turned off.
        carp "Can't open input file $infile: $!";
        return;
    }

    read_file($IN, $infile);            # $IN = file handle for $infile
    $IN->close();                       # done, close the file
}
# read_file (INPUT_STREAM, FILENAME);
# Read INPUT_STREAM one line at a time and hand each line, with its trailing
# newline removed, to do_line() together with its 1-based line number and
# FILENAME.  The file-level $lineno counter is restarted for every stream.
sub read_file ($$) {
    my($IN, $filename) = @_;
    my($text);

    $lineno = 0;                        # numbering restarts for each file
    while (defined($text = <$IN>)) {
        chomp($text);                   # strip off trailing newline
        do_line($text, ++$lineno, $filename);
    }
}
# do_line (LINE, LINENO, FILENAME);
# Process one line of text.  If the line contains an <Hx> .... </Hx>
# element, where Hx is H1, H2 or H3 in either case, the text between the
# tags is:
#   - announced through fyi(),
#   - printed to $OUT as "FILENAME, LINENO: heading", and
#   - appended to @headerArr as "heading (FILENAME, LINENO)".
# Only the first such element on a line is reported, and the opening and
# closing tags must be on the same line.  Lines without one are ignored.
sub do_line ($$$) {
    my($line, $lineno, $filename) = @_;

    # $1 = tag name (H1, H2 or H3); the \1 backreference makes the closing
    #      tag agree with the opening one.
    # [^>]* allows attributes on the opening tag, e.g. <H2 ALIGN=center>.
    # $2 = heading text; .*? is non-greedy so it stops at the first
    #      matching closing tag.
    if ( $line =~ m:<(H[123])\b[^>]*>(.*?)</\1\s*>:i ) {
        my($heading) = $2;
        fyi("FYI: $filename, $lineno: Found ($heading)");
        print $OUT "$filename, $lineno: $heading\n";
        # also save every heading in an array, for the sorted summary
        push(@headerArr, "$heading ($filename, $lineno)");
    }
}
# postProcess ();
# Write the summary section to $OUT: a banner line, every collected header
# in string-sorted order (one per line), and a trailer giving the current
# local time.
sub postProcess() {
    print $OUT "\n--- SORTED HEADERS ---\n";

    my($entry);
    foreach $entry (sort(@headerArr)) { # default sort compares as strings
        print $OUT "$entry\n";
    }

    my($stamp) = scalar(localtime());   # scalar context gives a date string
    print $OUT "\nGenerated $stamp.\n";
}
# Script entry point: the subs above are only definitions, so nothing runs
# until main() is called here.
#
main();
0; # value of the file's final statement. NOTE(review): this does not set the process exit status; use exit() if a specific status is intended