#!/usr/bin/perl -w use strict; =pod =head1 NAME B - Unpack a MIME HTML archive =head1 SYNOPSIS B unpacks MIME HTML archives that some browsers (such as Opera) save by default. The file extensions of such archives are .mht or .mhtml. The first HTML file in the archive is taken to be the primary web page, the other contained files for "page requisites" such as images or frames. The primary web page is written to the output directory (the current directory by default), the requisites to a subdirectory named after the primary HTML file name without extension, with "_files" appended. Link URLs in all HTML files referring to requisites are rewritten to point to the saved files. =head1 OPTIONS =over =item B<-h>, B<-?>, B<--help> Print a brief usage summary. =item B<-l>, B<--list> List archive contents instead of unpacking. Four columns are output: file name, MIME type, size and URL. Unavailable entries are replaced by "(?)". =item B<-o> I, B<--output> I If the argument ends in a slash or is an existing directory, unpack to that directory instead of current directory. Otherwise the argument is taken as a path to the file name to write the primary HTML file to. If the output directory does not exist, it is created. =back =head1 SEE ALSO http://www.volkerschatz.com/unix/uware/unmht.html http://www.loganowen.com/mht-rip/ http://sourceforge.net/projects/mhtconv/ =head1 COPYLEFT B is Copyright (c) 2012 Volker Schatz. It may be copied and/or modified under the same terms as Perl. =cut use File::Path; use File::Copy; use File::Glob; use URI; use MIME::Base64; use MIME::QuotedPrint; use HTML::PullParser; use HTML::Tagset; use Getopt::Long; # Add approriate ordinal suffix to a number. # -> Number # <- String of number with ordinal suffix sub ordinal { return $_[0]."th" if $_[0] > 3 && $_[0] < 20; my $unitdig= $_[0] % 10; return $_[0]."st" if $unitdig == 1; return $_[0]."nd" if $unitdig == 2; return $_[0]."rd" if $unitdig == 3; return $_[0]."th"; } { my %taken; # Find unique file name. # -> Preferred file name, or undef # MHT archive name (as a fallback if no name given) # <- File name not conflicting with names returned by previous calls, but which # may exist! sub unique_name { my ($fname, $mhtname)= @_; my ($trunc, $ext); if( defined $fname ) { $fname =~ s/^\s+//; $fname =~ s/\s+$//; $taken{$fname}= 1, return $fname unless $taken{$fname}; ($trunc, $ext)= $fname =~ /^(.*?)(\.\w+)?$/; $ext //= ""; } else { $trunc= $mhtname || "unpack"; $trunc =~ s/\.mht(?:ml?)?$//i; $ext= ""; } for my $suff (1..9999) { $fname= "${trunc}_$suff$ext"; $taken{$fname}= 1, return $fname unless $taken{$fname}; ++$suff; } return undef; } } # Output error message and exit with return value 1. sub abort { print STDERR "$_[0]\n"; exit 1; } # Generate output file directories and primary HTML file name depending on # --output option and primary HTML file name from archive. In case the primary # HTML file is not the first file in the archive, the secondary files directory # is renamed or the files moved on the second call, when the primary HTML file # name is known. # -> Value of --output option (or undef) # Primary HTML file name from MHT archive # Flag indicating if .._files subdirectory should be created # Hash reference to store resulting paths to sub mkfiledir { my ($outputopt, $firsthtmlname, $needfilesdir, $out)= @_; my $prevfilespath= $$out{filespath}; $firsthtmlname= "unpackmht-$$" unless defined $firsthtmlname; if( ! defined $outputopt ) { $$out{toppath}= "."; } elsif( -d $outputopt || $outputopt =~ m!/$! ) { $$out{toppath}= $outputopt; } else { ($$out{toppath}, $firsthtmlname)= $outputopt =~ m!^(.*/)?([^/]+)$!; abort "Empty output file name." unless defined $firsthtmlname; $firsthtmlname .= ".html" unless $firsthtmlname =~ /\./; $$out{toppath}= "." unless defined $$out{toppath}; } $$out{toppath} =~ s!/$!!; $$out{firstout}= "$$out{toppath}/$firsthtmlname"; $$out{filesdir}= $firsthtmlname; $$out{filesdir} =~ s/\.[^.]+$//; $$out{filesdir} .= "_files"; $$out{filespath}= "$$out{toppath}/$$out{filesdir}"; if( defined $prevfilespath ) { return unless $prevfilespath ne $$out{filespath}; return unless -d $prevfilespath; if( ! -d $$out{filespath} ) { File::Copy::move($prevfilespath, $$out{filespath}) or abort "Could not rename secondary files directory."; } else { for (File::Glob::bsd_glob("$prevfilespath/*")) { File::Copy::move($_, $$out{filespath}) or abort "Could not move secondary files."; } } } else { my $createall= $needfilesdir ? $$out{filespath} : $$out{toppath}; if( ! -d $createall ) { File::Path::make_path($createall) or abort "Could not create output directory $createall."; } } } my %opt; my @optdescr= ( 'output|o=s', 'list|l!', 'help|h|?!' ); my %config; my $status= GetOptions(\%opt, @optdescr); if( !$status || $opt{help} ) { print < | --output ] By default, unpacks an MHT archive (an archive type saved by some browsers) to the current directory. The first HTML file in the archive is taken for the primary web page, and all other contained files are written to a directory named after that HTML file. Options: -l, --list List archive contents (file name, MIME type, size and URL) -o, --output Unpack to directory or to file .html Use the command "pod2man unmht > unmht.1" or "pod2html unmht > unmht.html" to extract the manual. EOF exit !$status; } my $firstline= <>; $firstline =~ m!Content-Type: multipart/related;.* boundary=(.*)$! or abort "Can't find Content-Type header - not a MIME HTML file?"; my $boundary= $1; my $endcode= $boundary; $endcode =~ s/\s+$//; my %by_url; my @htmlfiles; my $fh; { $/= "\n--$boundary\n"; <>; my $fileind= 1; while( defined( my $data= <> ) ) { my %headers; while( $data =~ s/^([-\w]+): (.*)\n// ) { $headers{$1}= $2; } $data =~ s/^\n//; $data =~ s/\n--$endcode--\r?\n$/\n/s; my ($type, $origname); if( defined($headers{"Content-Type"}) && $headers{"Content-Type"} =~ /\bname=([^;]*)/ ) { $origname= $1; ($type)= $headers{"Content-Type"} =~ /^(\w+\/\w+)\b/; $type //= ""; } elsif( defined($headers{"Content-Disposition"}) && $headers{"Content-Disposition"} =~ /\bfilename=([^;]*)/ ) { $origname= $1; $type= $origname =~ /\.html?$/i ? "text/html" : ""; } elsif( defined($headers{"Content-Location"}) ) { $origname= $headers{"Content-Location"}; $origname =~ s!^.*/!!; $type= ""; } my $fname= unique_name($origname, $ARGV[0]); if( !defined($headers{"Content-Transfer-Encoding"}) ) { print STDERR "Warning: Encoding of ", ordinal($fileind), " file not found - leaving as-is.\n"; } elsif( $headers{"Content-Transfer-Encoding"} =~ /\bbase64\b/i ) { $data= MIME::Base64::decode($data); } elsif( $headers{"Content-Transfer-Encoding"} =~ /\bquoted-printable\b/i ) { $data= MIME::QuotedPrint::decode($data); } if( $opt{list} ) { $origname =~ s/\s+$// if defined $origname; my $size= length($data); print $fname // "(?)", "\t", $type || "(?)", "\t$size\t", $headers{"Content-Location"} // "(?)", "\n"; next; } $headers{fname}= $fname; if( $headers{"Content-Location"} ) { $headers{url}= $headers{"Content-Location"}; $headers{url} =~ s/\s+$//; $by_url{$headers{url}}= \%headers; } if( $type eq "text/html" ) { $headers{data}= $data; push @htmlfiles, \%headers; } else { mkfiledir($opt{output}, $htmlfiles[0]->{fname}, 1, \%config); $fname= "$config{filespath}/$fname"; open $fh, ">$fname" or abort "Could not create file $fname."; print $fh $data; close $fh; } } continue { ++$fileind; } } mkfiledir($opt{output}, $htmlfiles[0]->{fname}, 0, \%config); my $filesprefix= $config{filesdir} . "/"; my $outname= $config{firstout}; print "primary html output name: $outname\n"; for my $html (@htmlfiles) { my $linksubst= ""; my $p= HTML::PullParser->new( doc => \$html->{data}, "start" => 'text, attr, tagname', "text" => 'text', "end" => 'text' ); while( defined( my $tok= $p->get_token()) ) { my $linkary; my @linkattrs; if( ref($tok->[1]) && ($linkary= $HTML::Tagset::linkElements{$tok->[2]}) && (@linkattrs= grep $tok->[1]->{$_}, @$linkary) ) { for my $attr (@linkattrs) { my $uri= URI->new($tok->[1]->{$attr}); $uri= $uri->abs($html->{url}); $tok->[1]->{$attr}= "$filesprefix" . $by_url{$uri->as_string()}->{fname} if $by_url{$uri->as_string()}; } delete $tok->[1]->{"/"}; $linksubst .= "<$tok->[2] " . join(" ", map("$_=\"$tok->[1]->{$_}\"", keys %{$tok->[1]})) . ">"; } else { $linksubst .= $tok->[0]; } } $outname= "$config{filespath}/$html->{fname}" unless defined $outname; open $fh, ">$outname" or abort "Could not create file $outname."; print $fh $linksubst; close $fh; # for all except the first HTML file: $filesprefix= ""; $outname= undef; }