#!/usr/bin/perl -w
# $Id: /xmltwig/trunk/tools/ooo2txt/ooo2txt-006 4 2007-03-16T12:16:25.259192Z mrodrigu $

# -----------------------------------------------------------------
# Author          : F. Labbe
#   private       : fred@frederic-labbe.com
#   professional  : frederic.labbe@ch-avranches-granville.fr
#                 : http://ooo2txt.fr.st/
# Modified by mirod
# purpose         : convert OpenOffice.org file into ascii
# Test            :
# usage (source)  : perl ooo2txt [-on] [-e encoding] [-f field_name] ooo_file
# usage (binary)  : ooo2txt [-j] [-f field_name] ooo_file
#
# version         : 25/09/2002

my $ooo2txt_version = "0.0.6";

use strict;
use Getopt::Std;
use XML::Twig;
use Archive::Zip qw(:ERROR_CODES);
use File::Temp qw/ tempfile/;

my $USAGE= "USAGE: $0 [[-f |all] | [-o] | [-n]] [-e ] see http://ooo2txt.fr.st/";

# name of the archive member that holds the document text
my $OOO_XML_CONTENT= 'content.xml';

# -o outline only, -n number titles, -v version, -h help,
# -e output encoding, -f field name (or 'all'),
# -t width of the label column in field output (default 30),
# -d filler string between a label and its field value (default '.')
my %opt;
getopts('onvhe:f:t:d:', \%opt);

die "$0 version $ooo2txt_version\n" if( $opt{v});
die $USAGE, "\n" if( $opt{h});
die "options -o and -f are exclusive\n", $USAGE, "\n"
  if( defined( $opt{o}) && defined( $opt{f}));

my $zip_name = shift(@ARGV) || die $USAGE;

my $zip  = Archive::Zip->new( $zip_name)
  or die "cannot read archive file $zip_name\n";
my $file = $zip->memberNamed($OOO_XML_CONTENT)
  or die "Can't access data file $OOO_XML_CONTENT in zip.\n";

# extract the XML to an anonymous temp file, then rewind it for the parser
my $xml = tempfile();
binmode( $xml);   # Archive::Zip expects a binmode handle
$file->extractToFileHandle($xml) == AZ_OK
  or die "Extracting $OOO_XML_CONTENT from $zip_name failed\n";
seek( $xml, 0, 0);

my %option;      # options passed to XML::Twig->new
my $state={};    # various state information used during parsing

$option{output_encoding}= $opt{e} if( $opt{e});

# $conv converts a string to the requested output encoding
# (plain concatenation of its arguments when no -e was given)
my $conv= $opt{e} ? XML::Twig::encoding_filter( $opt{e})
                  : sub { return join '', @_; };

$opt{t}||= 30;
$opt{d}||= '.';

my $t;
if( $opt{f})
  { # output only field(s)
    $t= XML::Twig->new( %option,
                        twig_roots => { 'text:p' => sub { display_fields( $opt{f}, @_); }, },
                      );
  }
elsif( $opt{o})
  { # output only outline
    $t= XML::Twig->new( %option,
                        twig_roots => { 'text:h' => \&h, },
                      );
  }
else
  { # output all text
    $t= XML::Twig->new( %option,
                        twig_roots => { 'text:h' => \&h,
                                        'text:p' => sub { print $conv->( $_->text), "\n"; },
                                      },
                      );
  }

$t->parse( $xml);

# Handler for text:h (title) elements: prints the title text.
# With -n or -o, a title that has a text:level attribute is preceded by an
# empty line and prefixed with its outline number.
#   $t : the twig
#   $h : the text:h element
sub h
  { my( $t, $h)= @_;
    my $text= $h->text;
    if( $opt{n} || $opt{o})
      { my $text_level= $h->att( "text:level");
        if( $text_level)
          { print "\n";
            my $number= current_number( $text_level, $state);
            $text= $number . $text;
          }
      }
    print $conv->( $text), "\n";
  }

# Returns the outline number (eg "2.1.3 ", with a trailing space) for a title
# of level $text_level, and updates the counters kept in
# $state->{text_numbering} (one counter per level, outermost first).
#   $text_level : 1-based level of the title
#   $state      : hash ref holding the counters between calls
sub current_number
  { my( $text_level, $state)= @_;
    $state->{text_numbering}||= [];
    my $nb= $state->{text_numbering};

    # Moving back up the outline: forget the counters of the deeper levels,
    # but keep the counter of the current level so that siblings keep
    # counting (1, 2, 3...) instead of all being numbered 1.
    splice( @$nb, $text_level) if( @$nb > $text_level);

    $nb->[$text_level-1]++;

    # A level was skipped (eg a level 3 title right after a level 1 title):
    # count the missing level as 0 rather than joining an undefined value.
    foreach my $counter (@$nb)
      { $counter= 0 unless( defined $counter); }

    return join( '.', @$nb) . " ";
  }

# Handler for text:p elements when -f is used: for each matching text-input
# field found in the paragraph, prints the text that precedes the field (the
# label) and the field value on one line.
#   $fields : 'all', or the text:description of the field(s) to output
#   $t      : the twig
#   $p      : the text:p element
sub display_fields
  { my( $fields, $t, $p)= @_;
    my $filter= $fields eq 'all' ? qq{text:text-input}
                                 : qq{text:text-input[\@text:description="$fields"]};

    # Every processed field is cut from the paragraph, as are the children
    # before it, so first_child returns the next unprocessed field each time.
    while( my $field= $p->first_child( $filter))
      { my @children= $p->children;
        my $child= shift @children;
        my $text='';
        # collect (and remove) everything that comes before the field
        while( $child && $child->before( $field))
          { $text.= $child->text;
            $child->cut;
            $child= shift @children;
          }
        $field->cut;
        display_field_line( $text, $field->text);
      }
  }

# Prints one "label ..... value" line: the label, a filler made of the -d
# string repeated so that the label column is -t characters wide (no filler
# when the label is already that long), then the field value.
#   $text  : label (the text found before the field)
#   $field : field value
sub display_field_line
  { my( $text, $field)= @_;
    $text  = $conv->($text);
    $field = $conv->($field);
    my $nb_dots= $opt{t} - length( $text);
    $nb_dots=0 if( $nb_dots < 0);
    print $text, " ", $opt{d} x $nb_dots, " ", $field, "\n";
  }

__END__

=head1 NAME

ooo2txt

=head1 DESCRIPTION

read a Star/Open Office file (only Writer is supported at the moment,
C<.sxw>/C<.stw> files) and display the text.

=head1 SYNOPSIS

  ooo2txt doc.sxw                 # output text (in utf-8) for the document
  ooo2txt -e ISO-8859-15 doc.sxw  # output ISO-8859-15 encoded text for the doc
  ooo2txt -n doc.sxw              # output text, titles are numbered
  ooo2txt -f NUM_CUST doc.sxw     # output field NUM_CUST for the doc
  ooo2txt -f all doc.sxw          # output all fields for the doc
  ooo2txt -o doc.sxw              # output outline (titles are numbered)

=head1 OPTIONS

=over 4

=item * C<< -e [encoding] >>

output encoding for the text (the value is passed to XML::Twig's
C<output_encoding> option)

=item * C<-n>

number titles

=item * C<-o>

output only an outline of the doc (titles are numbered)

=item * C<-f [all|field_name]>

output all or a single field from the document

=back

=head1 TODO

format tables properly

get the style information to properly number titles

=head1 BUGS

tables are not displayed properly

numbering is very crude

=head1 PREREQUISITE

Archive::Zip

XML::Twig

XML::Parser

To use the C<-e> option a way to convert encodings is needed
(Text::Iconv and Iconv, Encode or Unicode::String and Unicode::Map8)

=head1 AUTHOR

Michel Rodriguez based on work by F. Labbe

=head1 LICENSE

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

Comments can be sent to mirod@xmltwig.com

=head1 SEE ALSO

OpenOffice.org: http://www.openoffice.org/

XML::Twig: http://www.xmltwig.com

Ooo2txt: http://ooo2txt.fr.st/