#!/usr/bin/perl # bibelot.pl $|++; my $VERSION = "0.94"; my $URL="http://sourceforge.net/projects/bibelot"; # Format ASCII text, esp. Project Gutenberg (http://www.promo.net/pg) etexts, # into a PalmDoc PDB file. # # # # Copyright (C) 2000,2001 John Fulmer # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # A full copy of the GNU Public License may be found at: # # http://www.gnu.org/copyleft/gpl.html # # # # This program was written using documentation and structures borrowed # from Paul J. Lucas' 'txt2pdbdoc' (http://www.best.com/~pjl/software.html) # and documentation from the Pyrite website # (http://www.pyrite.org/etext/format.html). Also, 'pdbdump' was invaluable in # troubleshooting format problems. # # Some of the header structures were borrowed, but the programming is my fault. # If it breaks, you keep both pieces. But let me know. I'm especially interested # in formatting problems, and trying to track down all the different cases I # can. # # Oh, and what is a 'bibelot'? # See http://www.dictionary.com/cgi-bin/dict.pl?term=bibelot or your nearest # dictionary. # # # jf # # # Version History: # # .01 Initial (ugly) version # # .02 -Partial re-write, made more modular # -Added option to turn compression off (-c) # -Added verbose (-v) # -Added option to set document name (-l) # -Added option to disable document formatting (-f) # -Added usage message (-h) # -Improved compression slightly # # # .03 -Added 'Project Gutenberg' (-g) mode, which sets # the beginning of actual text as a bookmark. # -Now adds two NULL characters between the PDB record # headers and the first record (Record 0). # -Initial bookmark support. # # .04 10/4/00 -Various cleanups # -Added filename sanity checks, and read/write checking # -Improved text formatting efficiency. # -Fixed bug that didn't collapse whitespace correctly # -Fixed off-by-one bug if forcing line lengths. # -Fixed incorrect use of 'pack' that required # 'no strict'. # -Gutenberg mode now sets chapter bookmarks, if able. # -Added dynamic bookmark support (-b). # Place text to bookmark # in between angle brackets (<>). The script will # search for the first instance of the text, and # create a bookmark using the text as the bookmark # name. One bookmark per line, please. # # For instance, let's see you wanted a bookmark # at text that says "Fit The First". At the bottom # of the origional text file, on a blank line, # place a "". The script will # Bookmark the first instance of "Fit The First" # in the document, and erase the "" # at the bottom of the file. The text is case # sensitive. # # -Adjusted 'smart' format function # # .5 10/6/00 -Work around (bug in Perl5?) where the :^ascii: # regex class was matching "[", and stripping it # from text. # -changed version number to match Freshmeat announcement # (whoops) # -removed spurious 's' option from getopts() # # .6 -re-added option to turn off 'smart' format mode. (-s) # (Found out what that spurious 's' was for) # -added code to rejoin words split by hyphens at eol. # # .7 12/21/00 -automagically grab title (if not specified with -t) # from file in 'Project Gutenberg' mode. Often # (but not always) the title is specified on the # first line of a text from Project Gutenberg. # Grab it, truncate (if necessary),plunk it into # the DOC title field. # -added -d option to turn off hypen correction # -modified help option and added to opening comments. # -verbose now echo's detected title. # -a few code cleanups. # # .8 1/4/00 -match more title entries from Project Gutenberg # -now hosted at Sourceforge, and development versions # in CVS. # -more (minor)tweaks to the smart formatting, to help # with badly formatted text with short lines early on. # -fixed bug that didn't strip out non-ascii chars. # Yes, Virginia, octal DOESN'T stop at 255..... # # .9 1/9/00 -Code cleanups. # -Strips control characters from title text. # -More sanity checks on output filename. If infile or # outfile are NULL, treat them as stdin/stdout. # -You can now use '-' to specify stdin or stdout # -Better compression. Thanks to Antaeus Feldspar, the # compression algorithm is more efficient. It also makes # bibelot a bit slower (6 seconds vs 4.5 seconds on an # average book file on my system). The efficiencies only # add up to %1-2 better compression, but that's 4-10k for # many books, which can add up. # -Compression error debugger, also courtesy Antaeus # Feldspar. Turn on by the $error_check global variable. # -New switch, 'o', to seed smart formatting offset. The # smaller the number, the better (maybe) the formatting, # but more badly chopped lines. Default is 20. # -Handle another different title for PG mode. # # .91 -Minor code change, Palm desktop for Windows demands # a timestamp in the PDB header. I faked up one # (0x11111111) for now. In the process, I also learned # that ActiveState Perl build 623 doesn't work with # with bibelot, something to do with a difference in # string handling. ActiveState's problem, if you ask me. # I would be interested if bibelot works on anything else # besides Linux, though... I DO know that nsperl 5.004 # for dos works fine. # # .92 2/26/01 -More minor changes for DOS and Windows versions of perl # now it actually works. Uses binmode() for output if # DOS/Win32 platform. (Are you happy now, Kyle?!?) # -Added check for common DOS and Win32 versions of # perl, currently only looks for ActiveState's Perl # for Win32, others probably work. # -Disabled filename sanity checks for Win32 platforms. # -Accidentally left the compression error checking on. # Should be MUCH faster now. # # # .93 4/02/01 -Condensed the title match regex to one line. # -Fixed problem with spaces in title with '-t' # # .94 5/18/01 -Added 8-bit support. This removes the check for high # byte control characters, so don't blame me if your # Palm blows up. :) # # # # Pragma goes HERE # # 'Use strict' so that we have to declare variables. Not a bad practice. # use strict; # # Global Variables go HERE # my $total_len = 0; # Total length of uncompressed text my $buff = ""; # Temporary buffer space my $header = ""; # PDB headers to preappend my $is_compr = 1; # '0' = no, '1' = yes my $is_verbose = 0; # If set, output debug info. my $dont_format = 0; # Don't format the text my $infile = "-"; # file to read, or STDIN (-) my $outfile = ">-"; # file to write to, or STDOUT (>-) my $line_len = 0; # If set, force linefeeds at $line_len my $pdb_name = "PalmDoc Document"; # Name of PalmDoc file my $col_position = 0; # Global column position for format my @block_size; # Compressed size of all text blocks my $avg_line_num = 0; # The next three are for use in my $avg = 0; # format_text()'s formatting logic. my $avg_total = 0; my $is_pg = 0; # 'Project Gutenberg' mode. Adds # A bookmark autoscan tag to the end of # the text to indicate the start of # the real text. my $pg_pos = 0; my $bookmark_buff = ""; # Temporary buffer for bookmark my $bookmark_num = 0; # Total number of bookmarks my $is_bookmark = 0; # Switch for bookmark mode my $is_smart = 1; # Switch to turn off 'smart' format my $title_set = 0; # Is title name set? my $pg_title; # Title for PG mode my $pg_title_set = 0; # Found $pg_title my $is_hyphen_off = 0; # Switch to turn off hypen correction my $sformat_offset = 20; # Smart format offset my $error_check = 0; # Compr. error checker my $is_evil = 0; # Check for Microsoft OS's ################################################################# # # # Main program # # # ################################################################# # # Process 'getopts' and return global variables # proc_opts() || die "Arg! Confusing command options (should never happen!)\n"; # # Read text from input source into buffer. Yes, all of it. And format it. # $buff = read_text() || die "Arg! Error in reading text (should never happen!)\n"; # # Create optional bookmarks # if ($is_pg || $is_bookmark) { $bookmark_buff = find_bookmarks($buff); } # # Compress, if necessary. # if ($is_compr) { $buff = compr_text($buff); } # # Generate PDB headers and record 0, and pre-append them to the buffer. # $buff = pdb_header() . $buff; # # Write optional bookmarks # if ($is_pg || $is_bookmark) { $buff .= $bookmark_buff; } # # Write text out # write_text($buff)|| die "Arg! Error in writing file (should never happen!)\n"; ; # Done. Wasn't that easy. ################################################################# # # # Get and process command line options # # # ################################################################# sub proc_opts { # # Local Variables # my $num_args; # # Turn off 'strict' for getopts(). # no strict; # # getopts() is your friend # use Getopt::Std qw(getopt getopts); getopts('l:vdht:cfgbso:') || die "Invalid Argument\n"; # # Force line length? # if ($opt_l) { # Not empty unless ($opt_l =~ /\D/) { # And only contains digits $line_len = int($opt_l); } else { # is alpha or otherwise die "Invalid line length.\n"; } } # # Help text # if ( $opt_h ) { print "\nusage: $0 [OPTIONS] \n\n" . "Formats text to PalmDoc format.\n" . "$URL\n" . "Version $VERSION\n\n" . "options:\n" . "\t-h\t\tthis message\n" . "\t-c\t\tturn file compression OFF\n" . "\t-v\t\tverbose\n" . "\t-t \"title\"\tdocument title\n" . "\t-f\t\tdon't format text\n" . "\t-l\t\tforce line width to bytes\n" . "\t-g\t\tEnable 'Project Gutenberg' mode\n" . "\t-b\t\tEnable Dynamic Bookmark mode\n" . "\t-d\t\tTurn off hyphen correction\n" . "\t-s\t\tTurn off 'smart' format\n" . "\t-o\t\tOffset for 'smart' format (default '20')\n\n" . "Use '-' or omit filenames to indicate STDIN or STDOUT.\n\n"; exit 0; } # # Set document name # if ( $opt_t ) { $opt_t =~ s/[\000-\011\013-\037\177-\377]//g; #strip control chars $opt_t =~ s/\s+/ /g; if ( (length $opt_t) > 31 ) { $pdb_name = substr($opt_t,0,28) . "..."; $title_set = 1; } else { $pdb_name = $opt_t; $title_set = 1; } } # # Set offset for 'smart' filtering. The larger the number, the more formatted # text it may miss (due to the shorter length), but you will get fewer false # positives due to short lines. # if ($opt_o) { # Not empty unless (($opt_o =~ /\D/) || # And only contains digits (int($opt_o) > 65)) { # Offsets greater than 65 are worthless $sformat_offset = int($opt_o); } else { # is alpha or otherwise die "Invalid offset.\n"; } } if ($opt_v) { $is_verbose = 1; } # Maximum Verbosity! if ($opt_c) { $is_compr = 0; } # Turn off compression? if ($opt_f) { $dont_format = 1; } # Don't format text if ($opt_g) { $is_pg = 1; } # Project Gutenberg mode if ($opt_b) { $is_bookmark = 1; } # Bookmark mode if ($opt_s) { $is_smart = 0; } # Turn off 'smart' format if ($opt_d) { $is_hypen_off = 1; } # Turn off hyphen correction # # Turn back on strict # use strict; # # Check for the 'Evil' OS...or OS/2 or whatever... # if ($^O =~ /MSWin32|dos|os2/i) { $is_evil = 1 } # # Everything left should be file names, or an error $num_args = @ARGV; # # use filenames or STDIN/STDOUT? # if ($num_args == 0) { # No args? $is_verbose = 0; # Turn off verbosity # defaults are good for STDIN/STDOUT }elsif ($num_args == 1) { # 1 arg? Must be for infile $infile = sanitize($ARGV[0], "input"); $is_verbose = 0; # Turn off verbosity }elsif ($num_args == 2){ # 2 args? Must be both infile/outfile $infile = sanitize($ARGV[0], "input"); $outfile = sanitize($ARGV[1], "output"); }else { # More? Error and die! die "Too many filename arguments on command line.\n"; } # # Return 'success' code # return(1); } ################################################################# # # # Read text from input into buffer. # # # ################################################################# sub read_text { # # Local Vars HERE # my $in; # Buffer to store text in. open (IN, "$infile") || die "Can't open $infile: $!\n"; while () { # # Format and add each line to $in # if ($dont_format) { # Don't format text $in .= $_; } else { $in .= format_text($_); } } close (IN); # # Set $total_len for header generation # $total_len = length $in; return ($in); } ################################################################# # # # Write text out to file. # # # ################################################################# sub write_text { open (OUT, ">$outfile") || die "Can't open $outfile: $!\n"; if ($is_evil) { binmode(OUT) } # Make MS OS's happy print OUT $_[0]; # Output the file close (OUT); return (1); } ################################################################# # # # Format text to a more PalmDoc reader # # friendly format. # # # ################################################################# sub format_text { # # Local Vars HERE # my $line_buff = ""; # Temorary buffer to format text in my @line; my $x; my $y; my $testchar; my $newx = ""; # # Function to take a line of text (in $_[0]), strip out extra # linefeeds and such and, if necessary, add linefeeds to give # max -l # chars per line. Must also maintain a global (col_position) # to make sure that when this function is reentered, # we know on what column position we left off last time. # # # Grab title from text, first one that matches, wins. # if (($is_pg) && !($pg_title_set)){ $pg_title = $_[0]; if ( $pg_title =~ s/.+?Project Gutenber(g|g's) Etext( | of) (.+?)(by|,|,by|\*|\.).+/$3/i ) { chop $pg_title; if ( (length $pg_title) > 31 ) { $pg_title = substr($pg_title,0,28) . "..."; } $pg_title_set = 1; } } # # Assign input string to @line, remove ending newlines, split by whitespace # chomp; @line = split(/\s+/, $_[0]); # # Attempt at some formatting logic. If average line size is somewhat over 80, # we can safely assume that the file is not formatted, and any linefeeds we # find should stay right where they are, since they are probably formatting. # # If we find the average size is ~ 80 or under, but the linefeed comes somewhat # under the average size, we will guess the linefeed stays. # # if (length($_[0]) > 30) { # Ignore short lines $avg_total += length($_[0]); $avg = $avg_total / ++$avg_line_num; } # # Check each word, strip any whitespace characters, and insert # a newline before the word if it would cross the $line_len boundary. # # Then add the word to the output string. # # Note that some text may be mangled, if it depends on hard returns for # formatting, or double spaces. # foreach $x (@line) { if ($x) { if ($is_smart) { $x =~ s/\s+?|[\000-\011\013-\037]//g; } # Ixnay spaces, control chars # tab/space formatted text will # certainly break. # # If forcing to a specific line length, check to see if adding the word # and space will overflow the specified line length. If so, add newline first # and reset the col_position counter. # if ( $line_len && (((length $x) + $col_position + 1) > $line_len) ) { $line_buff .= "\n"; $col_position = 0; } # # Add word + space to output buffer, then increment the column position # $line_buff .= $x . " "; $col_position += (length $x) + 1; } } unless ($is_hyphen_off) { $x = length $line_buff; $line_buff =~ s/-\s+\Z//; # fix hypen separated words at # the end of lines. $col_position += $x - (length $line_buff); # Adjust for hypen removal } # # If the output string contains no words, assume a double spaced line # otherwise, replace the final newline. # if ( $avg > 85 ) { $line_buff .= "\n"; # Preserve linefeeds if $col_position = 0; # file appears to } # already be stripped. if (($line_buff eq "") && ($col_position != 0)) { # Double space $line_buff = "\n\n"; $col_position = 0; } elsif ($line_buff eq "") { # Single space $line_buff = "\n"; $col_position = 0; # # This is some VooDoo that seems to work well. So far. # # What it does is this: Using average line size information at the top of this # function, it assumes that lines that are less than the average - # $sformat_offset AND if the current column position (where the linefeed would # go) is less than the average - $sformat_offset , it assumes that it is a # formatted line, and inserts the linefeed. The further into the file it goes, # the more accurate it should be. # # I can imagine all kinds of places where this would break horribly, # but it would break anyway without this bit's help. # } elsif ( ((length $line_buff) <= $avg - $sformat_offset ) && ($col_position <= $avg - $sformat_offset ) && ($avg < 85) && ($is_smart) ) { # Assume formatted text $line_buff .= "\n"; $col_position = 0; } return ($line_buff); } ################################################################# # # # Generate the PDB headers and Record 0 # # # ################################################################# sub pdb_header { # # Local Vars HERE! # # # Some constants # my $COUNT_BITS = 3; my $DISP_BITS = 11; my $DOC_CREATOR = "REAd"; my $DOC_TYPE = "TEXt"; my $RECORD_SIZE_MAX = 4096; # 4k record size my $dmDBNameLength = 32; # 32 chars + 1 null my $pdb_rec_offset; # PDB record offset my $header_buff = ""; # Temporary buffer to build the headers in. my $x; my $y; # # PDB header # # We're going to set some variables and then use 'pack' to put them into a # buffer. # # Here's the format in C (Dword = 4 bytes, Word = 2 bytes) # #typedef struct { /* 78 bytes total */ # char name[ dmDBNameLength ]; # Word attributes; # Word version; # DWord create_time; # DWord modify_time; # DWord backup_time; # DWord modificationNumber; # DWord appInfoID; # DWord sortInfoID; # char type[4]; # char creator[4]; # DWord id_seed; # DWord nextRecordList; # Word numRecords; #} pdb_header; my $pdb_header_size = 78; my $pdb_attributes = 0; my $pdb_version = 0; my $pdb_create_time = 0x11111111; # Palm Desktop demands my $pdb_modify_time = 0x11111111; # a timestamp. my $pdb_backup_time = 0; my $pdb_modificationNumber; my $pdb_appInfoID = 0; my $pdb_sortInfoID = 0; my $pdb_type = $DOC_TYPE; my $pdb_creator = $DOC_CREATOR; my $pdb_id_seed = 0; my $pdb_id_nextRecordList = 0; my $pdb_numRecords = (int ($total_len / 4096)) + 2; # +1 for record 0 # +1 for fractional part if ($is_pg || $is_bookmark) { $pdb_numRecords += $bookmark_num; } # # Pack that header! # # # Set $pdb_name to detected name, unless forced using -t. # if ( !($title_set) && ($is_pg) && ($pg_title_set)) { $pdb_name = $pg_title; } if ($is_verbose) { print "Document Title: $pdb_name\n"; } my $pdb_header = pack("a32nnNNNNNNa4a4NNn",$pdb_name,$pdb_attributes, $pdb_version,$pdb_create_time, $pdb_modify_time,$pdb_backup_time, $pdb_modificationNumber,$pdb_appInfoID, $pdb_sortInfoID,$pdb_type,$pdb_creator, $pdb_id_seed,$pdb_id_nextRecordList, $pdb_numRecords); # # Sanity check # if ( (length $pdb_header) != 78) { die "pdb_header malformed\n"; } # # Create the PalmDoc header # # # Here's the format in C # # struct doc_record0 { /* 16 bytes total */ # Word version; /* 1 = plain text, 2 = compressed text */ # Word reserved1; # DWord doc_size; /* uncompressed size in bytes */ # Word num_recs; /* not counting itself */ # Word rec_size; /* in bytes: usually 4096 (4K) */ # DWord reserved2; # }; my $doc_header_size = 16; my $doc_version = $is_compr + 1; # Compression on by default my $reserved1 = 0; my $doc_doc_size = $total_len; my $doc_rec_size = 4096; my $doc_num_recs = (int ($total_len / 4096)) + 1; my $doc_reserved2 = 0; # # Pack Record 0 # my $doc_header = pack("nnNnnN",$doc_version,$reserved1,$doc_doc_size, $doc_num_recs,$doc_rec_size,$doc_reserved2); # # Sanity check! # if ( (length $doc_header) != 16) { die "doc_header malformed\n"; } # # Template for the PDB record headers # # Docs are REAL fuzzy on this. # # # Format in C # #struct pdb_rec_header { /* 8 bytes total */ # DWord offset; # struct { # int delete : 1; # int dirty : 1; # int busy : 1; # int secret : 1; # int category : 4; # } attributes; # char uniqueID[3]; #} my $pdb_rec_header_size = 8; my $pdb_rec_attributes = 0x40; # We'll fake this, 0x40 = 'dirty' my $pdb_rec_uniqueID = 0x3D0; # Simple increment # # Since we need to so a bunch of these, we'll use this as a template # my $pdb_rec_header_template = "Nccn"; # # Generate and write headers # # # PDB record headers are generated and placed at the head of the file. # The number of headers required is Total_File_Bytes / 4096 + 1 # The +1 being for the fractional part left over. # # Someone could have documented this better. :) # # For the record, the file format is: # # PDB Header (78 bytes) # PDB Record Headers (8 bytes) # . . . # . . . Repeat N + B + 1 times, where N is # of 4096K blocks # . . . The +1 is for record 0 (DOC header) # . . . B = # of bookmarks # (DB Records) # 0x0 0x0 Two NULLS # Record 0 (PalmDoc Header) # Text # . . . # . . . # . . . # Optional Bookmark records # . . . # EOF # # $pdb_rec_offset = $pdb_header_size + (($pdb_numRecords)* $pdb_rec_header_size) + 2; # # Write PDB header, and PDB rec header for record 0 # $header_buff = $pdb_header . pack($pdb_rec_header_template, $pdb_rec_offset, $pdb_rec_attributes, "a",$pdb_rec_uniqueID ); $pdb_rec_offset += $doc_header_size; # Add offset for doc_header if ($is_pg || $is_bookmark) { $pdb_numRecords -= $bookmark_num;} for ($x = 0; $x < $pdb_numRecords - 1; $x++) { # -1 for rec 0 header added above # # If we aren't compressing, every other block besides 0 is guarenteed to be # $RECORD_SIZE_MAX # if (! $is_compr && $x > 0 ) { $block_size[$x] = $RECORD_SIZE_MAX; } $pdb_rec_offset += $block_size[$x]; ++$pdb_rec_uniqueID; $header_buff .= pack($pdb_rec_header_template,$pdb_rec_offset, $pdb_rec_attributes,"a",$pdb_rec_uniqueID); } # # Write optional bookmark pdb headers # if (($is_pg || $is_bookmark) && $bookmark_num) { if ($is_compr){ # Find the end of the text $pdb_rec_offset += $block_size[$x]; } else { $pdb_rec_offset += $total_len % 4096; } for ($y = 0; $y < $bookmark_num; $y++) { $pdb_rec_uniqueID += 10; $header_buff .= pack($pdb_rec_header_template,$pdb_rec_offset, $pdb_rec_attributes,"a",$pdb_rec_uniqueID); $pdb_rec_offset += 20; # Bookmarks are 20 bytes. } } # # Write 2 NULLS # $header_buff .= 0x00 . 0x00; # Write Record 0 $header_buff .= $doc_header; return ($header_buff); } ################################################################# # # # Compress the text # # # ################################################################# sub compr_text { # # # Compresses text with the PalmDoc compression scheme. # # Requires: # $_[0], which contains the entire text to be compressed. # # Returns: $compr_buff, which contains the compressed text. # global @block_size, Array that contains the length of each # compressed block. # 'scalar(@block_size)' should be = to $pdb_numRecords # # Local Vars HERE! # my $total_compr_size = 0; # Final compressed text size my $compr_buff = ""; # Temporary output buffer my $numrecords = (int($total_len / 4096) +1); # Number of blocks to compress. my $x; my $y; my $block_offset; my $block; # Contains the current 4096 byte block of text my $block_len; # Length of current block my $index; # Current scan position in block my $byte; # Char at index (for space + char compression) my $byte2; # Char at index+1 my $test; # Potentially compressible text for # LZ77 compression. my $frag_size; # Current size of above my $frag_size2; # Spare for lazy byte compression my $test2; # spare for above my $test3; # second spare my $pos; # Position (in $block) of reference text # for $test # to compress against. my $pos2; # spare for above my $pos3; # second spare my $back; # $index - pos my $mask; # Bitwise mask to do LZ77 'magic' my $compr_ratio; # Compression ratio my $done; my $comp_block_offset = 0; # The $compr_buff index # block begins. my $FRAG_MAX = 10; # Max LZ77 fragment size my $FRAG_MIN = 3; # Min LZ77 fragment size my $LAZY_BYTE_FRAG = $FRAG_MAX + $FRAG_MIN - 1; $block_size[0] = 0; # Record 0 is already written and # is not compressed. for ($x = 1; $x <= $numrecords; $x++) { $block_offset = ($x - 1) * 4096; $block = substr($_[0],$block_offset, 4096); if ($x >= $numrecords) { # Last block $block = substr($block,0,($total_len % 4096)); } $block_len = length($block); # # Tricky PalmDoc compression scheme. Here's the overview: # # Given a compressed stream, read a byte. # The byte will lie in the following zones: # 0 represents itself # 1...8 type A command; read the next n bytes # 9...7F represents itself # 80..BF type B command; read one more byte # C0..FF type C command; represent "space + char" # # # Sooo. If we just write ASCII text, it will fall within 9..7F or 0 (NULL). # No worries. # # If we write 1...8, the next n bytes will be taken as verbatim. This is # used to mask high byte characters, like accents. I'm not a-using them # at this point. High byte characters get stripped in the text processing # function. # # If we write C0..FF, it will be treated as a space + character. # Write the space, then xOR 0x80, should work. # # 80..BF is tricky. A 16 bit number is written: # Throw away offset bits to copy (+3) # 0 0|0 0 0 0 0 0 0 0 0 0 0|0 0 0 # # So. To encode we keep an index of where we currently are in the file, # and constantly check 3-10 char fragments from $index+frag_size against # the text in $index - 2047 of a 4096 byte block, which contains the # uncompressed text. # # If we find a match, we generate the above gobblygook, (that is, place the # offset into a packed INT (2 bytes), shift it 3 places, then place the number # of bits to copy from the offset in the lower three bits of the INT) place # it in the compressed buffer, increment the index accordingly (# of bits # compressed), and go from there. # Whee. # $index = 0; # # Compression loop # while ( $index < $block_len ) { # # Type 'A', Escape high bytes # $byte = substr($block,$index,1); # Char at $index if ($byte =~ /[\200-\377]/) { # is high bit set? $y = 1; # found at least one! # # Loop to find out how many concurrent high bit characters, max 8 # while ( (substr($block,$index + ($y + 1),1) =~ /[\200-\377]/) && ($y < 8) ) { ++$y; # If found, increment counter } $compr_buff .= chr($y); # Write escape code $compr_buff .= substr($block,$index,$y); # Write text $index += $y; # Increment the index } else { # Real compression routines # # Type 'B', simple LZ77 compression # $frag_size = $FRAG_MIN; # We don't care about anything less $test = substr($block,$index,$frag_size); # pull the current fragment $pos = rindex($block, $test, $index - 1); # check against the buffer # # There's a sliding window of 2047 bytes that we can pull reference # characters from. # if ( ($pos > 0) && ($index - $pos <= 2047) && # Inside our 2047 byte window ( $index < $block_len - $frag_size) ) { # # Found a match! # looking for bigger fragments # for ($y = 4; $y <= $FRAG_MAX; $y++ ) { ++$frag_size ; $test2 = substr($block,$index,$frag_size); $pos2 = rindex($block, $test2, $index - 1); if (($pos2 > 0) && ($index - $pos2 <= 2047) && ($index < $block_len - $frag_size) ) { # found a match! $pos = $pos2; $test = $test2; } else { # no match, go back --$frag_size; last; } } # Sanity check if ($frag_size > $FRAG_MAX) { die "frag_size too big!!!: $frag_size\n"; } # # Now look for an even better match starting at the next position. # This is known as 'lazy matching'. # # NOTE: Why is ($STD_FRAG_MAX + $STD_FRAG_MIN - 1) so magic? # Let's pretend that we are currently at index 1001, looking for matches. # The longest match we can find for the text starting at 1001 has a length of 3. # If the longest match we can find for the text starting at 1002 has a length of # 10, then obviously we get better compression by sending the byte at 1001 out # as a literal and encoding the match found at 1002. But if the longest match # for the text starting at 1002 has a length of 12 ($STD_FRAG_MAX + $STD_FRAG_MIN - 1, # for the PalmDoc spec) then we can encode the match we find for the text at 1001 # and *still* have a match of length 10 for the text starting at 1004. $frag_size2 = $frag_size + 2; $test2 = substr($block,$index + 1, $frag_size2); $pos2 = rindex($block, $test2, $index - 1); if (($pos2 > 0) && ($index - $pos2 <= 2047) && ($index < $block_len - $frag_size2) ) { # found a match for ($y = $frag_size2;$y <= $LAZY_BYTE_FRAG; $y++ ) { # Look for more ++$frag_size2; $test2 = substr($block,$index + 1, $frag_size2); $pos2 = rindex($block, $test2, $index - 1); if (($pos2 > 0) && ($index - $pos2 <= 2047) && ($index < $block_len - $frag_size2) ) { # found a match! } else { # no match, go back --$frag_size2; last; } } if ($frag_size2 < $LAZY_BYTE_FRAG) { # # Lazy byte found; write byte to output and abort compression round # $pos = 0; $compr_buff .= substr($block,$index,1); ++$index; } } if ($pos > 0) { # Did we abort the compression? # # Figure out how far to reach back into the buffer, and create OR mask # that sets the high bit and indicates how big the compressed fragment is. # $back = $index - $pos; $mask = 0x8000 | int($frag_size - 3); # # This line does all the magic; munge and add to output buffer # $compr_buff .= pack("n",int($back << 3) | $mask); $index += $frag_size; } } else { # # Type 'C', Space + Char compress # $byte = substr($block,$index,1); # Char at $index $byte2 = substr($block,$index + 1,1); # next char as well if ( ($byte eq " ") && ($byte2 =~ /[\100-\176]/ ) && ($index <= $block_len - 1)) { # Got a space + char # Set the high bit # and add to output # buffer. $compr_buff .= pack("c", ord ($byte2) | 0x80 ); $index += 2; # Compressed 2 bytes } else { $compr_buff .= $byte; # No compression ++$index; } } } } # # Check for errors in the compression routine then move the counter that # identifies where the compressed representation of the most recently handled # block starts. Turn on by setting $error_check to '1' # if ($error_check) { check_comp($block, substr($compr_buff, $comp_block_offset)); $comp_block_offset = length($compr_buff); } if ( $is_verbose ) { $| = 1; # Flush output buffers $done = int(($x / ((length $_[0]) / 4096)) * 100); if ($done > 100) {$done = 100;} print "\rBlock: $x\tComplete: $done%"; } # # Calculate compressed block sizes, and the total compressed size of the file # $block_size[$x] = (length ($compr_buff)) - $total_compr_size; $total_compr_size = length ($compr_buff); if ( $is_verbose ) { $done = int(($block_size[$x] / $block_len) * 100); print "\tCompressed: $done%"; } $| = 0; # Flush buffers off } # # And one linefeed for Ra.... # if ($is_verbose) { print "\n"; } # # Print some useless information # if ($is_verbose ) { $compr_ratio = ($total_compr_size / $total_len) * 100 ; print "Original Size: $total_len\tCompressed Size: $total_compr_size\t"; printf ("Reduced: %.2f%\n", $compr_ratio); } return ($compr_buff); } ################################################################# # # # Generate Bookmark Headers # # # ################################################################# sub bookmark_rec { # # For now, we are only going to find the end of Gutenberg Project "Fine Print" # text and set it as a bookmark. # # # Local Vars HERE # #my $book_pg = "*END*THE SMALL PRINT!"; my $book_pg = $_[1]; my $book_name = "Bookmark $bookmark_num"; # Default bookmark name if ($_[2]) { $book_name = $_[2];} # Bookmark name was passed to function. my $book_pos = $_[3]; # Offset from start of text to place bm my $book_header_size = 20; # Size of Bookmark header my $book_buff = ""; # Output buffer unless ($book_pos) { # If bookmark position not passed $book_pos = (index($_[0],$book_pg)) + 1; # Index starts at 0, DOC readers 1 } # # Make sure the bookmark name is 15 chars or less # if (length $book_name > 15) {$book_name = substr($book_name,0,12) . "...";} if ($book_pos > 0) { $book_buff = pack("a16N",$book_name,$book_pos); ++$bookmark_num; return ($book_buff); } else { return (""); # No bookmark } } ################################################################# # # # Sanitize filename entries # # # ################################################################# sub sanitize { # # Do various checks on filename entries. Strip control characters, substitute # underscores for most forms of punctuation. # # Recieves filename or path + filename to process, whether is it a input file # or output file, and returns the sanitized version. # # # # Local vars HERE # chomp; # Just to be safe; my $filename = $_ = $_[0]; my $io = $_[1]; my $junk; my $path = $filename; # # If input file, all we care about is that the file exists, is a text file # and readable. For the output file, we want to sanitize the filename, # and make sure the destination directory is writable. # if ($is_evil) { return ($_) } # MS OS. Ack! Game over! No sanity for you! if ($io =~ /in/i) { # Input file if ($filename && $filename ne "-" ) { # and not null or "-" unless ( -e $filename && -r $filename ) { die "Input file IO error: $filename $!\n";} } else { # is null $_ = "-"; # stdin } } elsif ($io =~ /out/i) { # Output file if ($filename) { # and not null $junk = eval "tr#\-/.a-zA-Z0-9#_#cs"; if (m#/#) { # contains a path. $path =~ s#^(.*/).*#$1#; # Strip filename from path unless (-w $path) { die "Output file IO error: Output directory unwritable\n";} } unless ( (!(-e $filename)) || -w $filename ) # Not exist or writable { die "Output file IO error: Output file unwritable\n";} } else { # is null $_ = ">-"; # stdout } } else { # Shouldn't get here. die "Error in sanitize function\n"; } return ($_); } ################################################################# # # # Find Bookmarks # # # ################################################################# sub find_bookmarks { my $pg_bookmark = "*END*THE SMALL PRINT!"; my $pg_bookmark_name = "Text Begins"; my $bookmark_rec = ""; if ($is_pg) { # # Set 'start of text' bookmark # $bookmark_rec .= bookmark_rec($_[0],$pg_bookmark,$pg_bookmark_name); # # Find and set chapter bookmarks # while ($_[0] =~ /\n((?:chapter|chaptre).*?)\s*?\n/gi ) { if ($is_verbose) { print "Bookmark: $1\t\tOffset: " . pos($_[0]) . "\n"; } $bookmark_rec .= bookmark_rec($_[0],"$1","$1",pos($_[0]) - length($1)); } } if ($is_bookmark) { while ($_[0] =~ /\n<(.+?)>/g ) { if ($is_verbose) { print "Bookmark: $1\n"; } $bookmark_rec .= bookmark_rec($_[0],"$1","$1"); } $_[0] =~ s/\n<(.+?)>//g; } return ($bookmark_rec); } ################################################################# # # # Compression Error Checking # # # ################################################################# sub check_comp ($$) { # # Compares the original block to one that's been compressed and decompressed # and reports any places where they differ. # # Requires: # $original_block, the formatted block that was originally sent # to be compressed. Passed to the subroutine as a parameter # # $comp_block, the compressed version of the block # Passed to the subroutine as a parameter # # Returns: Nothing. Output from this routine goes to standard output. # # # # # # Local Vars HERE! # my $original_block = $_[0]; my $comp_block = $_[1]; my $roundtrip_block = ""; # buffer for decompressed text. my $comp_index = 0; # index for start of next element in $comp_block my $element; # element read from the compressed data stream my $bytes_added = 0; # the number of bytes added to the output my $pair_var; # integer used to hold the two-byte packed pair. my $offset; # used if B compression is encountered. my $length; # used if B compression is encountered. my $i; # simple loop variable while ($comp_index < length($comp_block)) { $element = substr($comp_block, $comp_index, 1); # # decompress the next element: # if ((ord($element) == 0x00) || # Literal byte range ((ord($element) >= 0x09) && (ord($element) <= 0x7F))) { # # output the literal byte. # $roundtrip_block .= $element; $bytes_added = 1; $comp_index += 1; } elsif ((ord($element) >= 0x01) && # 'A' (escaped) code range (ord($element) <= 0x08)) { # # Copy next $element bytes literally. (shouldn't happen at this point) # $roundtrip_block .= substr($comp_block, $comp_index + 1, ord($element)); $bytes_added = ord($element); $comp_index += (1 + $bytes_added); } elsif ((ord($element) >= 0x80) && # 'B' (LZ77) code range (ord($element) <= 0xBF)) { # # read the next byte and copy the offset, length pair if it's a B code. # $pair_var = ((ord($element)) << 8) + ord(substr($comp_block, ($comp_index + 1), 1)); $offset = ($pair_var >> 3) & 0x7FF; $length = ($pair_var & 0x07) + 3; # # sanity checks # if (($offset <= 0) or ($offset > 2047)) { # out of window error die "offset is " . $offset . " at index " . (length($roundtrip_block)). "!!!\n"; } if (($length < 3) or ($length > 10)) { # too few/too many die "length is " . $length . " at index " . # bytes to copy error (length($roundtrip_block)) . "!!!\n"; } if ((length($roundtrip_block) - $offset) < 0) { # read before start # of block error die "offset " . $offset . " goes beyond beginning of block!!!\n"; } # # This last one would really be better if a meaningful representation of # *where* in the file/block the offensive offset occurs could be included. # for ($i = 1; $i <= $length; $i++) { $roundtrip_block .= substr($roundtrip_block, (length($roundtrip_block) - $offset), 1); } $bytes_added = $length; $comp_index += 2; } elsif ((ord($element) >= 0xC0) && # 'C' (space + char) code range (ord($element) <= 0xFF)) { # # output the space + character # $roundtrip_block .= " "; $roundtrip_block .= chr(ord($element) & 0x7F); $bytes_added = 2; $comp_index += 1; } } # end while if ( $roundtrip_block ne $original_block) { die "Compressed text does not match original\n"; } } # end of check_comp