mobi2html 0000755 0001750 0001750 00000017552 10736000563 011745 0 ustar tompe tompe #!/usr/bin/env perl
# Copyright (C) 2007 Tommy Persson, tpe@ida.liu.se
#
# mobi2html, Copyright (C) 2007 Tommy Persson, tpe@ida.liu.se
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
use FindBin qw($RealBin);
use lib "$RealBin";
use HTML::TreeBuilder;
use Palm::PDB;
use Palm::Doc;
use Image::Size;
use Date::Parse;
use Date::Format;
use Getopt::Mixed;
use MobiPerl::EXTH;
use MobiPerl::Util;
use strict;
use vars qw ($opt_rawhtml $opt_record0 $opt_saveallrecords);
# Command line handling. The switches --rawhtml, --record0 and
# --saveallrecords are read later through the $opt_* variables declared with
# "use vars" above; the two remaining positional arguments are the input file
# and the directory the book is unpacked into.
Getopt::Mixed::getOptions ("rawhtml record0 saveallrecords");
my $filename = shift;
my $explodedir = shift;
die "A directory to explode the mobi file must be specified as second argument"
    unless defined $explodedir;
die "File does not exist: $filename" unless -e $filename;
# Create the target directory when it is missing and stop right away if that
# fails; otherwise the failure would only show up later as missing files.
unless (-d $explodedir) {
    mkdir ($explodedir)
        or die "Could not create directory $explodedir: $!";
}
# Load the Palm database given on the command line and report its header
# fields on STDERR.
my $pdb = Palm::PDB->new;
$pdb->Load ($filename);
my $name = $pdb->{"name"};
my $version = $pdb->{"version"};
my $type = $pdb->{"type"};          # also read by parse_record_0
my $creator = $pdb->{"creator"};    # also read by parse_record_0
my $seed = $pdb->{"uniqueIDseed"};
my $ctime = $pdb->{"ctime"};
my $mtime = $pdb->{"mtime"};
# Printable forms of the creation and modification times.
my $sctime = ctime ($ctime);
my $smtime = ctime ($mtime);
print STDERR "Name: $name\n";
print STDERR "Version: $version\n";
print STDERR "Type: $type\n";
print STDERR "Creator: $creator\n";
print STDERR "Seed: $seed\n";
print STDERR "Resdb: " . $pdb->{"attributes"}{"ResDB"} . "\n";
print STDERR "AppInfoDirty: " . $pdb->{"attributes"}{"AppInfoDirty"} . "\n";
print STDERR "ctime: $ctime - $sctime\n";
print STDERR "mtime: $mtime - $smtime\n";
print STDERR "baktime: " . $pdb->{"baktime"} . "\n";
my @records = @{$pdb->{"records"}};
# scalar (@records) is the number of records; $#records is the last index
# and would report one record too few.
print STDERR "Number of record: " . scalar (@records) . "\n";
# Images are numbered 1, 2, 3 ... in the order they are met in the record
# list; the hash maps that number to the file the image is saved as.
my $image_index = 0;
my %image_index_to_filename = ();
# Walk over all records of the database.
#
# A record for which imgsize reports a width is treated as an image: it is
# written to $explodedir as record-ID.TYPE and registered under the next
# image index. Other records are only written out (as record-ID) when
# --record0 or --saveallrecords was given. The record with id 0 is decoded
# with parse_record_0.
foreach my $r (@records) {
    my $id = $r->{"id"};
    my $cat = $r->{"category"};
    my $offset = $r->{"offset"};
    my $data = $r->{"data"};
    my $size = length ($data);
    # Named so that the file-level $filename and $type are not shadowed.
    my $recname = "record-$id";
    my ($x, $y, $imgtype) = imgsize (\$data);
    if (defined $x) {
        print STDERR "Record $id - $cat - $offset - $size - $x x $y\n";
        $image_index++;
        $image_index_to_filename{$image_index} = "$recname.$imgtype";
        my $imgpath = "$explodedir/$recname.$imgtype";
        open (my $out, ">", $imgpath)
            or die "Could not open file $imgpath: $!";
        binmode ($out);
        print $out $data;
        close ($out) or die "Could not write file $imgpath: $!";
    } else {
        if (defined $opt_record0 or defined $opt_saveallrecords) {
            # NOTE(review): unlike the images this file is created in the
            # current directory, not in $explodedir; kept as it was - confirm
            # whether that is intended.
            open (my $out, ">", $recname)
                or die "Could not open file $recname: $!";
            # Record data is binary, so no line-end translation.
            binmode ($out);
            print $out $data;
            close ($out) or die "Could not write file $recname: $!";
        }
    }
    # With --record0 the script stops after the first record it has seen.
    if (defined $opt_record0) {
        exit (0);
    }
    if ($id == 0) {
        parse_record_0 ($data);
    }
}
#my @resources = @{$pdb->{"resources"}};
#print STDERR "Number of resources: " . $#resources . "\n";
# The text of the book as delivered by the PDB object.
my $text = $pdb->text;
#{
# local $/;
# $text =~ s/\r//g;
#}
# --rawhtml: emit the text exactly as stored, before any fix-ups.
if (defined $opt_rawhtml) {
    binmode (STDOUT);
    print $text;
}
# Collect the value of every filepos attribute, written either as
# filepos=123 or as filepos="123". The keys keep the digits exactly as they
# appear in the text (leading zeros included).
#
# One regex pass replaces the former byte-by-byte loop that copied a 50
# character substring for every position of the text.
my %fileposmap;
print STDERR "Looking for filepos\n";
while ($text =~ /filepos=(?:(\d+)|"(\d+)")/g) {
    $fileposmap{defined $1 ? $1 : $2} = 1;
}
# File-scope variables of the former scanning loop, left in the state that
# loop finished in because code outside this block may still refer to them.
my $len = length ($text);
my $cp = $len;
print STDERR "Found all filepos\n";
my $offset = 0;
print STDERR "Adding name attributes\n";
foreach my $pos (sort keys %fileposmap) {
# print STDERR "NAMEPOS: $pos\n";
my $a = substr ($text, $pos+$offset, 2);
if ($a eq "ignore_unknown (0);
$tree->parse ($text);
$tree->eof ();
#my $tree = HTML::TreeBuilder->new_from_content ($text);
# Rewrite the MobiPocket specific attributes in the parsed tree and save the
# result as an HTML file inside $explodedir.
use File::Basename qw(basename);
fix_filepos_attributes ($tree);
fix_image_tags ($tree);
# Only the last path component of the input name is used, so that
# "books/novel.mobi" ends up as "$explodedir/novel.html" instead of pointing
# into a sub directory that does not exist.
my $htmlfile = basename ($filename);
# Swap a trailing .mobi/.prc/.pdb for .html; anchored so that the same
# letters in the middle of the name are left alone.
$htmlfile =~ s/\.(?:mobi|prc|pdb)$/.html/;
my $htmlpath = "$explodedir/$htmlfile";
open (my $htmlout, ">", $htmlpath)
    or die "Could not open file $htmlpath: $!";
print $htmlout $tree->as_HTML;
# Buffered write errors only surface when the handle is closed.
close ($htmlout) or die "Could not write file $htmlpath: $!";
# Replace the recindex attribute of <img> elements with a src attribute
# naming the image file that was saved for that index.
#
# $tree - element tree; every element returned by find ("img") is examined.
#
# The file names come from the file-level hash %image_index_to_filename,
# which the record loop fills while it writes the images.
#
# Images without a recindex attribute, and images whose index has no saved
# file, are left untouched; previously their src attribute was removed.
sub fix_image_tags {
    my $tree = shift;
    my @imgel = $tree->find ("img");
    foreach my $img (@imgel) {
        my $recindex = $img->attr ("recindex");
        # An ordinary <img src="..."> carries no recindex: nothing to fix.
        next unless defined $recindex;
        # recindex is zero padded text in the markup, the hash key is a
        # plain number.
        my $ind = int ($recindex);
        my $filename = $image_index_to_filename{$ind};
        if (not defined $filename) {
            print STDERR "FIX IMAGE TAGS: $recindex - $ind - no image saved for this index\n";
            next;
        }
        print STDERR "FIX IMAGE TAGS: $recindex - $ind - $filename\n";
        # Setting an attribute to undef removes it from the element.
        $img->attr ("recindex", undef);
        $img->attr ("src", $filename);
    }
}
# Turn filepos="N" attributes of <a> elements into href="#N" links.
#
# $tree - element tree; every element returned by find ("a") is examined.
#
# Anchors whose filepos attribute is missing or false are not modified.
sub fix_filepos_attributes {
    my ($tree) = @_;
    my @anchors = $tree->find ("a");
    print STDERR "Fixing filpos attribute\n";
    for my $anchor (@anchors) {
        my $target = $anchor->attr ("filepos");
        next unless $target;
        $anchor->attr ("href", "#" . $target);
        # Passing undef as the value drops the attribute.
        $anchor->attr ("filepos", undef);
    }
}
# Decode the record with id 0 and report its contents on STDERR.
#
# $rec - raw data of the record.
#
# The first 16 bytes are handed to parse_palmdoc_header. The remainder is
# handed to parse_mobi_header, but only when the file-level $type and
# $creator read from the database are "BOOK" and "MOBI".
sub parse_record_0 {
    my ($rec) = @_;
    parse_palmdoc_header (substr ($rec, 0, 16));
    return unless $type eq "BOOK" and $creator eq "MOBI";
    parse_mobi_header (substr ($rec, 16));
}
# Print the fields of a PalmDoc header on STDERR.
#
# $data - header bytes. The template reads a 16 bit value, skips two
#         bytes, then reads a 32 bit, two 16 bit and one more 32 bit value,
#         all big-endian: 16 bytes in total.
sub parse_palmdoc_header {
    my ($header) = @_;
    my @values = unpack ("nxxNnnN", $header);
    # One label per unpacked value, in template order.
    my @labels = (
        "PDHEADER Version: ",
        "PDHEADER Length: ",
        "PDHEADER NRecords: ",
        "PDHEADER Recsize: ",
        "PDHEADER Unknown: ",
    );
    foreach my $i (0 .. $#labels) {
        print STDERR $labels[$i] . $values[$i] . "\n";
    }
}
# Print the leading fields of a MOBI header on STDERR and, when the header
# announces one, continue with the EXTH block.
#
# $data - bytes of record 0 following the PalmDoc header.
#
# A 4 character tag and five big-endian 32 bit values are read from the
# start, plus one 32 bit flag word at byte offset 0x70. When bit 0x40 of the
# flag word is set, everything from the offset given by the second field
# (printed as "length") onwards is passed to parse_mobi_exth.
sub parse_mobi_header {
    my ($header) = @_;
    my @fields = unpack ("a4NNNNN", $header);
    my ($flags) = unpack ("N", substr ($header, 0x70));
    my $exth_start = $fields[1];
    my @labels = (
        "MOBIHEADER doctype: ",
        "MOBIHEADER length: ",
        "MOBIHEADER type: ",
        "MOBIHEADER codep: ",
        "MOBIHEADER uniqid: ",
        "MOBIHEADER ver: ",
        "MOBIHEADER exthflg: ",
    );
    # The slice always yields six entries, so the flag word is the seventh.
    my @values = (@fields[0 .. 5], $flags);
    foreach my $i (0 .. $#labels) {
        print STDERR $labels[$i] . $values[$i] . "\n";
    }
    parse_mobi_exth (substr ($header, $exth_start)) if $flags & 0x40;
}
# Print the contents of an EXTH block on STDERR.
#
# $data - bytes starting at the EXTH block: a 4 character tag, a 32 bit
#         length and a 32 bit item count, followed by the items. Each item
#         is a 32 bit id, a 32 bit size that includes these 8 header bytes,
#         and size-8 bytes of content.
#
# Content of ids that MobiPerl::EXTH::is_binary_data reports as binary is
# passed through MobiPerl::Util::iso2hex before printing.
#
# The walk stops with a message as soon as an item does not fit into $data
# or claims a size below 8. Such an item used to produce an invalid unpack
# template ("a-4") and kill the script.
sub parse_mobi_exth {
    my $data = shift;
    my ($doctype, $len, $n_items) = unpack ("a4NN", $data);
    print STDERR "EXTH doctype: $doctype\n";
    print STDERR "EXTH length: $len\n";
    print STDERR "EXTH n_items: $n_items\n";
    my $datalen = length ($data);
    my $pos = 12;
    foreach (1..$n_items) {
        if ($pos + 8 > $datalen) {
            print STDERR "EXTH: item header at offset $pos is outside the data, stopping\n";
            last;
        }
        my ($id, $size) = unpack ("NN", substr ($data, $pos, 8));
        if ($size < 8 or $pos + $size > $datalen) {
            print STDERR "EXTH: item $id at offset $pos has invalid size $size, stopping\n";
            last;
        }
        my $content = substr ($data, $pos + 8, $size - 8);
        my $hid = sprintf ("%x", $id);
        my $hsize = sprintf ("%x", $size);
        if (MobiPerl::EXTH::is_binary_data ($id)) {
            $content = MobiPerl::Util::iso2hex ($content);
        }
        print STDERR "ITEM: $hid $hsize - $id $size - $content\n";
        $pos += $size;
    }
}
=pod
=head1 NAME
mobi2html - A script to explode a DRM-free MobiPocket file to html
=head1 SYNOPSIS
mobi2html file.mobi unpackdir
=head1 DESCRIPTION
A script to explode a DRM-free MobiPocket file to html.
=head1 OPTIONS
=over 4
=item B<--rawhtml>
Output the unmodified HTML code on STDOUT. Mostly useful for debugging.
=back
=head1 EXAMPLES
mobi2html "Bleak House.prc" unpack
mobi2html "Bleak House.prc" unpack --rawhtml > t.html
=head1 AUTHOR
Tommy Persson (tpe@ida.liu.se)
=cut
html2mobi 0000755 0001750 0001750 00000022755 10736050745 011755 0 ustar tompe tompe #!/usr/bin/env perl
# html2mobi, Copyright (C) 2007 Tommy Persson, tpe@ida.liu.se
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
use FindBin qw($RealBin);
use lib "$RealBin";
use MobiPerl::MobiFile;
use MobiPerl::Opf;
use MobiPerl::Config;
use MobiPerl::LinksInfo;
use HTML::TreeBuilder;
use Getopt::Mixed;
use strict;
use vars qw ($opt_title $opt_author $opt_htmlfile $opt_mobifile $opt_gentoc
$opt_coverimage $opt_noimages $opt_addcoverlink
$opt_prefixtitle);
Getopt::Mixed::getOptions ("title=s author=s htmlfile=s mobifile=s gentoc
coverimage=s noimages addcoverlink
prefixtitle=s");
#
# expand html document with links...
# Generate TOC automatically, guide thingy
# Getting images to work...
# Small image in library, 8 record, 180x240 jpeg
# make testhtmlsgentoc: links in a non-generated TOC are not working
#
#
# Debian: libpalm-perl
# libimage-size-perl
#
# 8 DWord dwType //pub type: 2=book,3=palmdoc,4=audio,news=257,feed=258,magazine=259 etc
# C DWord dwCodepage //1252=western, 65001 = UTF8. Better not use anything else
# Everything left on the command line after option parsing is an input
# HTML file. Without at least one there is nothing to convert.
my @filenames = @ARGV;
die "Usage: html2mobi [options] file.html [more.html ...]\n"
    unless @filenames;
my $tree = 0;
my %file_to_tree = ();
my %file_to_title = ();
my $namerefindex = 0;
my %file_to_nameref = ();
my $linksinfo = MobiPerl::LinksInfo->new;
my $mobifile = "t.mobi";
# Transfer the command line options into the configuration object. The
# author has a fixed fallback that an explicit --author replaces.
my $config = MobiPerl::Config->new;
$config->add_cover_link (1) if defined $opt_addcoverlink;
$config->no_images (1) if defined $opt_noimages;
$config->cover_image ($opt_coverimage);
$config->author ("Unspecified Author");
$config->author ($opt_author) if defined $opt_author;
$config->title ($opt_title);
$config->prefix_title ($opt_prefixtitle);
# Default output name: the first input file with a trailing .html or .htm
# replaced by .mobi. The pattern is anchored so that the same letters
# elsewhere in the path are not rewritten.
my $filename = $filenames[0];
$mobifile = $filename;
$mobifile =~ s/\.html?$/.mobi/;
if ($#filenames == 0) {
    # A single file is converted on its own.
    $tree = one_html_file ($filename, $linksinfo);
} else {
    # Several files are combined; a placeholder title is set when none was
    # given on the command line.
    if (not defined $opt_title) {
        $config->title ("dummycollectiontitle")
    }
    $tree = get_collection ($config, $linksinfo, @filenames);
}
# --htmlfile: additionally save the HTML that goes into the book.
if (defined $opt_htmlfile) {
    # Three argument open: the name comes from the command line and must not
    # be able to influence the open mode.
    open (my $htmlout, ">", $opt_htmlfile)
        or die "Could not open html file $opt_htmlfile: $!\n";
    my $text = $tree->as_HTML;
    # A literal "&nbsp;" that reached the tree as text is written by as_HTML
    # with its ampersand escaped, i.e. as "&amp;nbsp;"; turn it back into the
    # entity.
    # NOTE(review): this statement previously read s/&\;nbsp\;/ \;/g, which
    # can only match the text "&;nbsp;" and looks like the line below after
    # an entity-decoding accident - confirm against the upstream source.
    $text =~ s/&amp\;nbsp\;/&nbsp\;/g;
    print $htmlout $text;
    # Buffered write errors only surface when the handle is closed.
    close ($htmlout)
        or die "Could not write html file $opt_htmlfile: $!\n";
}
# An explicit --mobifile replaces the output name derived from the first
# input file.
$mobifile = $opt_mobifile if defined $opt_mobifile;
# If the output name is still identical to the first input file, extend it
# so that the book is not written over that file.
$mobifile .= ".mobi" if $mobifile eq $filename;
MobiPerl::MobiFile::save_mobi_file ($tree, $mobifile, $linksinfo, $config);
#
# HTML manipulation functions
#
sub one_html_file {
my $filename = shift;
my $linksinfo = shift;
print STDERR "ONEHTMLFILE: $filename\n";
my $tree = new HTML::TreeBuilder ();
$tree->ignore_unknown (0);
$tree->parse_file ($filename) || die "Could not find file: $filename\n";
$linksinfo->check_for_links ($tree);
my $titleelement = $tree->find ("title");
if ($titleelement and not $config->title ()) {
$config->title ($titleelement->as_trimmed_text ());
}
if (not $config->title ()) {
my $title = $filename;
$title =~ s/\.html//;
$title =~ s/\.htm//;
$config->title ($title);
}
MobiPerl::Util::fix_pre_tags ($tree);
# Fix links
my @refs = $tree->look_down ("href", qr/^\#/);
my @hrefs = ();
my @refels = ();
my %href_to_ref = ();
foreach my $r (@refs) {
$r->attr ("filepos", "0000000000");
my $key = $r->attr ("href");
$key =~ s/\#//g;
push @hrefs, $key;
push @refels, $r;
# $r->attr ("href", undef);
}
my $data = $tree->as_HTML ();
foreach my $i (0..$#hrefs) {
my $h = $hrefs[$i];
my $r = $refels[$i];
my $searchfor1 = "id=\"$h\"";
my $searchfor2 = "= 0) {
#
# search backwards for <
#
while (substr ($data, $pos, 1) ne "<") {
$pos--;
}
## $pos -=4; # back 4 positions to get to