#!/usr/bin/perl use strict; use warnings; use XML::Parser; my $p = XML::Parser->new( Style => "Tree" ); my $input = "life.xml"; my $depth = 0; # For pretty indentation of HTML. my $cache = ''; # Cached formatted text for file output. traverse( $p->parsefile( $input ) ); exit(0); sub traverse { # This subroutine is called recursively to convert an XML list-of-lists-etc # into an HTML directory-of-files-of-lists-of-lists-etc. $depth++; my $node = shift; while ( my ( $element, $child ) = splice @{ $node }, 0, 2 ) { if ( ref $child ) { # This deals with mark-up nodes. my %attr = %{ shift @{ $child } }; # For the sake of consistency, we'll put all the data associated # with this node into %attr. $attr{element} = $element; $attr{taxon} = ucfirst $element; $attr{name} = ucfirst $attr{name}; ( $attr{font} = $element ) =~ s/^(infra|sub|super)//; $attr{font} = 'division' if $attr{font} eq 'phylum'; if ( $attr{element} eq "domain" ) { # Each domain gets its own HTML file. $depth = 0; # Reset depth. $cache = ''; # Empty cache. # Descriptions may contain , so we need to parse these # appropriately. $attr{description} = get_text( $child ); open my $output, ">", "$attr{name}.html" or die "Can't open $attr{name}.html for writing: $!\n"; # Modern Perls allow us to open a filehandle to a lexical # variable. No more local *FILEHANDLE for us... # Boilerplate... print $output <<"THIS"; $attr{name}

$attr{taxon} $attr{name}

$attr{status}

$attr{description}

THIS close $output; } else { # This deals with all nodes below , which need to be # put in HTML lists-of-lists-of-lists-etc. my $padding = ' ' x $depth; # HTML pretty printing. $attr{description} = get_text( $child ); # Again, descriptions may contain , so we need to parse # these appropriately. $cache .= "$padding
  • $attr{taxon} $attr{name}\n"; $cache .= "$padding

    $attr{status}

    \n" if $attr{status}; $cache .= "$padding

    $attr{description}

    \n"; $cache .= "$padding
      \n"; traverse( $child ); $cache .= "$padding
    \n"; $cache =~ s/$padding
      \n$padding <\/ul>\n$//; # If the traversal added no data, we delete the empty list. # There are probably less hack-ish ways of doing this. Note # that this is why we don't just print as we go along. $cache .= "$padding\n"; } } else { # This deals with text nodes. $cache .= get_text( $child ); } } $depth--; } sub get_text { # This subroutine is called whenever we need to parse text, which may have # embedded sections. This cannot be done by traverse, since # this will en-list the Latin names. If we had a DTD, this would be made # explicit by only allowing within text data (and disallowing # nesting). The subroutine can cope with both vanilla text, and # [ '0', 'description', '0', 'more text', 'kingdom', [ ...divisions...] ] # nodes. my $node = shift; my $text = ''; # Cached text. if ( ref $node ) { # Note that because $node is a reference, when we modify it with splice, # we are modifying the data structure that traverse works on. This means # we can splice out text nodes (as with the description), but leave any # element child nodes (like a within an ) for # traverse to deal with. while ( defined $node->[0] && $node->[0] eq '0' ) { # Text is found with '0' as its 'element' name. my ( $marker, $data ) = splice @{ $node }, 0, 2; $text .= $data; while ( defined $node->[0] && $node->[0] eq "latin" ) { # Deal with embedded nodes. my ( $element, $latin ) = splice @{ $node }, 0, 2; $text .= "$latin->[2]"; } } } else { $text = $node; } # Strip unwanted whitespace. $text =~ s/^\s+//sg; # Leading. $text =~ s/\s+$//sg; # Trailing. $text =~ s/\s*\n\t+/ /sg; # Internal linebreaks. You could do something clever with text wrapping # and $depth if you wanted, but we'll take the 'simple' approach. return $text; }