#!/usr/bin/perl -w
# Created by Ben Okopnik on Wed Oct 25 18:09:09 EDT 2006
# Processes an mbox, creates the LG mailbag articles; run without args for
# full documentation

use strict;

# Turn on autoflush for the currently selected output handle
$| = 1;

# use CGI::Carp qw/fatalsToBrowser warningsToBrowser/;
use CGI qw/:standard/;

############## USER_CONFIG SECTION ##########################

# The author[s] credited for the Mailbag, etc.
my $author = "";

# Directory that receives the KnowledgeBase entries
my $tag_kb = "$ENV{LG_ROOT}/data/kb";

# A thread's first post is truncated on the main page once it exceeds this
# many characters
my $cutoff = 3500;

# Delimiter tags for the specially-treated kinds of content
my $raw_start     = '[RAW]';
my $raw_stop      = '[/RAW]';
my $private_start = '[PRIVATE]';
my $private_stop  = '[/PRIVATE]';

# Three-character subject prefixes (each is followed immediately by a ':'
# in the subject line).  Every prefix maps to a pair: its weight (i.e., its
# position on the main page) and its section heading.
my %sections = (
    gaz => [ 0, "Gazette Matters" ],
    sts => [ 1, "Still Searching" ],
    tkb => [ 2, "Talkbacks" ],
    tct => [ 3, "2-Cent Tips" ],
    gem => [ 4, "LG Gems" ],
);

# Titles (HTML and heading) generated for specified output filenames. The
# default is "Mailbag".
my %titles = (
    'lg_talkback.html'    => "Talkback",
    'lg_talkback2.html'   => "Talkback2",
    'lg_talkback3.html'   => "Talkback3",
    'lg_talkback4.html'   => "Talkback4",
    'lg_tips.html'        => "2-Cent Tips",
    'lg_gems.html'        => "Selected Gems from Our Mailbag",
    'lg_laundrette.html'  => "The Linux Launderette",
    'lg_launderette.html' => "The Linux Launderette",
);

############## USER_CONFIG SECTION ##########################

#####################################################
# This code is a really ugly hack, and I must have  #
# been smoking really cheap crack when I wrote it.  #
# It is, however, useful.                           #
#####################################################

# Nifty little procedure: show the docs if the args are wrong.
# The list form of exec() hands $0 straight to perldoc instead of letting a
# shell re-parse the script path.
exec '/usr/bin/perldoc', $0 unless @ARGV && -f $ARGV[0];

# Get the input filename
my $in_file = shift @ARGV;
die "'$in_file' does not exist!\n" unless -f $in_file;

# Ask file(1) whether the input looks like a mailbox.  The list-form pipe
# open bypasses the shell, so filenames containing spaces or shell
# metacharacters are passed through intact.  If file(1) cannot be run, the
# type stays empty and the check below fails, as it did with backticks.
my $file_type = '';
if ( open my $file_cmd, '-|', '/usr/bin/file', $in_file ){
    local $/;
    my $out = <$file_cmd>;
    $file_type = $out if defined $out;
    close $file_cmd;
}
die "'$in_file' is not a valid mailbox!\n" unless $file_type =~ /mail text/;

# Get the output filename if one is given.  A plain conditional expression
# is used rather than "my $x = shift if ...": perlsyn documents the
# behaviour of a 'my' declaration with a statement modifier as undefined.
my $fname = $ARGV[0] ? shift @ARGV : undef;

# Fail if that filename doesn't have a ".html" extension
die "The filename must have a '.html' extension!\n" if $fname && $fname !~ /\.html$/;

# If an optional title is supplied, use it.
my $title = $ARGV[0] ? shift @ARGV : undef;

# If no output filename is given, make it the default one
$fname ||= "lg_mail.html";

if ( ! $title ){
    # If the output filename matches one of those defined in the %titles hash,
    # above, set the $title variable accordingly (it'll be used for the title
    # and the headings later.)
    $title = $titles{$fname} if exists $titles{$fname};

    # If no match exists, set the title to a default one.
    $title ||= "Mailbag";
}

# Define all HTML entities (the names for characters 160 .. 255, in order)
my @ent = qw/
    nbsp iexcl cent pound curren yen brvbar sect uml copy ordf laquo not shy
    reg macr deg plusmn sup2 sup3 acute micro para middot cedil sup1 ordm
    raquo frac14 frac12 frac34 iquest Agrave Aacute Acirc Atilde Auml Aring
    AElig Ccedil Egrave Eacute Ecirc Euml Igrave Iacute Icirc Iuml ETH Ntilde
    Ograve Oacute Ocirc Otilde Ouml times Oslash Ugrave Uacute Ucirc Uuml
    Yacute THORN szlig agrave aacute acirc atilde auml aring aelig ccedil
    egrave eacute ecirc euml igrave iacute icirc iuml eth ntilde ograve oacute
    ocirc otilde ouml divide oslash ugrave uacute ucirc uuml yacute thorn yuml
/;

# Build the 'chr => entity' equivalence list using the above
my %entity;
$entity{chr $_} = "&" . $ent[$_ - 160] . ";" for 160 .. 255;

# Add the 'special' characters.  These must map to entity references;
# mapping each character to itself would leave '<', '>' and '&' unescaped
# in the generated HTML.
@entity{ split //, "<>&" } = qw/&lt; &gt; &amp;/;

# This sub does all the text processing. Individual email bodies are fed to
# it, and it processes them as described in the comments below.
# cleanup(TEXT) takes one string and returns the processed string.  build()
# calls it on both the 'From' line and the mail body.  In order, it:
#   - trims surrounding whitespace;
#   - replaces everything between $private_start and $private_stop with
#     " [[[Elided content]]] ";
#   - applies the large s%%% substitution to the text lying outside
#     $raw_start ... $raw_stop;
#   - applies the paragraph, editorial-comment and "René" fix-ups to the
#     whole body.
# It prints an error and exits if the PRIVATE or RAW tags are unbalanced.
#
# NOTE(review): in this copy, several replacement strings and comments in
# cleanup() and build() appear to have lost their angle-bracketed text
# (HTML tags, the readline operator in "while ( $x = ){", part of the
# URL-hotlinking pattern).  Presumably an extraction artifact - restore from
# the canonical copy before running.  TODO confirm.
sub cleanup { my $body = shift; # Get rid of all preceding whitespace; replace all following whitespace with a single "\n" $body =~ s/\A\s*(.*?)\s*\Z/$1\n/s; # Remove anything that was explicitly marked as private my $ps = () = $body =~ /(\Q$private_start\E)/g; my $pe = () = $body =~ /(\Q$private_stop\E)/g; unless ($ps == $pe){ print "\nERROR: Unequal number of 'PRIVATE' tags in the mail body.\n\n"; exit; } $body =~ s/(?:^|\n)[ ]*\Q$private_start\E[ ]*\n/\n$private_start/g; $body =~ s/\n[ ]*\Q$private_stop\E[ ]*(?:\n|$)/\n$private_stop/g; $body =~ s/\Q$private_start\E.*?\Q$private_stop\E/ [[[Elided content]]] /gsm; my $rs = () = $body =~ /(\Q$raw_start\E)/g; my $re = () = $body =~ /(\Q$raw_stop\E)/g; unless ($rs == $re){ print "\nERROR: Unequal number of 'RAW' tags in the mail body.\n\n"; exit; } $body =~ s/(?:^|\n)[ ]*\Q$raw_start\E[ ]*\n/\n$raw_start/g; $body =~ s/\n[ ]*\Q$raw_stop\E[ ]*(?:\n|$)/\n$raw_stop/g; # Beginning of the biggest 's///' statement I've ever written :))) $body =~ s%(?:^|\Q$raw_stop\E)(.+?)(?:\Q$raw_start\E|$)% my $body1 = $1; # Dump the TAG footer $body1 =~ s#^\+-\+---+\+-\+\nYou've asked.*?mailman/listinfo/tag\n*##gsm; # Get rid of any whitespace preceding and following the body $body1 =~ s/\s*(.*?)\s*/$1/s; # Convert all necessary characters to entities ### WHOOPS... this breaks Unicode characters. Must rethink. # $body1 =~ s/([\240-\377<>&])/$entity{$1}/eg; $body1 =~ s/([<>&])/$entity{$1}/eg; # Get rid of the "--- next part ---" junk $body1 =~ s/-{5,} next part -{5,}$//gsmi; # Toss the 'Attachment was scrubbed' stanza added by Mailman $body1 =~ s/^A[^\n]+ was scrubbed\.\.\.\n.*?U[Rr][Ll]\s?:[^\n]+\n//gsm; # Convert the 'tweaked' "From"s at the beginning of the line so they # don't automatically become
s
		$body1 =~ s/^(?:>|>)From/ From/gsm;
		# Convert .sigs to 
 - they usually use some kind of layout
		# $body1 =~ s#^-- ?\n(([^\n]+\n){1,8})([^\n]*\n){0,5}$#(defined$1&&defined$3)?"
-- \n$1
$3":"$&"#egsm; $body1 =~ s#^(-- \n([^\n]+\n){0,9}([^\n]*\n?)?)\Z#(defined$1)?"
$1\n
":"$&"#sme; # Convert cited email headers to
		$body1 =~ s#^(((Date|From|To|Subject|Newsgroups|User-Agent|X-Rcpt-To|X-Country|X-UIDL|X-Bogosity|X-Mas|Message-ID|Reply-To|CC|Cc):[^\n]+\n)+)#
\n$1
#gsm; # Tricky bit here: since we're going to replace multiple newlines with #

s, and _don't_ want any of those in

 blocks, we're going to
		# insert a _space_ within any TAG-special "
" blocks (i.e.,
		# anything delimited by "``/''" or "```/'''".
		my ($flag, $tmp_body);
		for ( split /\n/, $body1 ){
			if (/^(```?)\s*$/ && ! $flag){
				$tmp_body .= length($1) == 3 ? "
\n" : "
\n";
				$flag = 1;
				next;
			}
			if (/^'''?\s*$/ && $flag){
				$tmp_body .= "
\n"; $flag = 0; next; } if ($flag && /^$/){ $tmp_body .= " \n"; } else { $tmp_body .= "$_\n"; } } $body1 = $tmp_body if $tmp_body; my($tmp1, $tmp2); # Convert all simple (single-string) _em_s and _e_m_s # $body1 =~ s{(^|\s)_(\S+)_($|\s|\.|,)}{$1$2$3}gsm; $body1 =~ s{(^|\s)_(\S+)_($|\s|\.|,|;)}{($tmp1 = $2) =~ tr|_| |; "$1$tmp1$3"}egsm; # Convert all simple (single-string) *strong*s and *str*ongs* # $body1 =~ s{(^|\s)\*(\S+)\*($|\s|\.|,|;)}{$1$2$3}gsm; $body1 =~ s{(^|\s)\*(\S+)\*($|\s|\.|,|;)}{($tmp2 = $2) =~ tr|*| |; "$1$tmp2$3"}egsm; # Hotlink the URLs my $tmp3; # $body1 =~ s#(?$1$2#gsm; $body1 =~ s#(? 85 ? substr($2, 0, 40) . "[...]" . substr($2, -40) : $2; "$1$tmp3"#egsm; # LG addresses get special treatment (relative paths within LG) $body1 =~ s#(a href=["'])http://linuxgazette.net/([^'"\n]+)#$1../$2#g; # Replace text smilies with an image; preserve text as 'alt' tag $body1 =~ s#(:\)|:-\)|:>|:->)#$1#g; # Replace the frownies as well $body1 =~ s#(:\(|:-\(|:<|:-<)#$1#g; # Find any runs of '>'s at line start (quoted email) and
 them
		$body1 =~ s#(^>.*\n)+#\n
\n$&
\n#gm; $body1; %seg; # Find any number of repeated newlines and enclose in

and

, and... $body =~ s#(\n{2,})#\n

$1

\n#gsm; # ...wrap the entire body in

and

. $body =~ s#.*#

\n$&\n

\n#s; # Insert editorial comments $body =~ s|\@\#\$(.*?)\$\#\@|

\n[[[ $1 ]]]\n

\n|gsm; # Clean up overzealous

markup $body =~ s#

\s*\n

#
#gsm;
	$body =~ s#
\s*\n

#
#gsm; # Special processing for René! $body =~ s/=\?iso-8859-15\?Q\?Ren=E9\?=|Ren=E9|Ren=C3=A9|Ren=C3=E9|René|René|Ren\xe9/René/gsm; # Return the much-massaged body $body; } # Create the 'misc/lg' subdir if it doesn't exist unless ( -d "misc/lg" ){ system "/bin/mkdir -p misc/lg" and die "mkdir failed: $!\n"; } # Build up the data structure that contains the processed mailbox my ($subj, $reflink, $iss, $link, $reftitle, $x, $fn, $str, $from, $date, $body, $last, $seen, %files); sub build { # Clear the old content of the special Talkback header undef $reflink; # If it is a Talkback, parse the necessary info out of the subject # if ( $subj =~ m%^tkb:\s*Talkback:\s*((?:issue)?\d+)(/\S+)% ){ if ( $subj =~ m%^tkb:\s*Talkback:\s*((?:issue)?\d+)(/[^#]+)% ){ $iss = $1; $link = "$1$2"; # Read the article title from the article itself (SVN working copy) # if it exists; use LG-local URL otherwise. if ( -f "$ENV{LG_ARTICLES}/$link" ){ open Fh, "$ENV{LG_ARTICLES}/$link" or die "$ENV{LG_ARTICLES}/$link: $!\n"; while ( $x = ){ if ( $x =~ /^title:\s*(.*)\s*$/ ){ $reftitle = $1; last; } } close Fh; } else { $reftitle = $2; } # Create the Talkback header $reflink = qq@[ In reference to "$reftitle" in LG#$iss ]@; } # Remove the marker tags from the subject before using it in the output for ( keys %sections ){ $subj =~ s/^$_:\s*//i; } # If we're at the top of the first post in the thread, append # the subject and the Talkback header (if it exists) unless ( exists $files{$fn} ){ $str = h3("$subj")."\n"; $str .= p(b($reflink)) if $reflink; } $str .= p("\n".b(cleanup($from)).br."\n".b($date)."\n")."\n\n".cleanup($body); # Add the formatted post as an element in an arrayref pointed # to by its (defanged) subject in the '%files' hash push @{$files{$fn}}, $str; # Clear out all the variables used in processing this post $str = $subj = $from = $date = $body = ""; } my %gang; # It's sort of odd to explicitly open a filename that's already in @ARGV... 
# NOTE(review), covering the rest of the script (mailbox-reading loop,
# lg_header(), weight() and the output section): this copy appears to have
# lost its angle-bracketed text - the "while ( ){" / "for ( ){" loops have
# no readline operator, and several printed strings and patterns are empty
# where markup is expected.  Presumably an extraction artifact - restore
# from the canonical copy before running.  TODO confirm.
# but if I wanted to do anything else, I'd have to do CLI switch # processing, and Oh, Mother. open In, $in_file or die "$in_file: $!\n"; while ( ){ # Convert to Unix format by removing DOS carriage returns y/\r//d; if ( /^From / .. /^$/ ){ # Get all the participants' email addresses for later checking # against the LG roster $gang{$1}++ if /^From:.*?([a-zA-Z0-9_.+-]+\@[a-zA-Z0-9_.+-]+)/; # Once the body of the email has been built up, process it if ( $body ){ &build; } chomp; # Collapse repeated whitespace y/ \t/ /s; # "Collect" any multi-line subjects into a single string if ( $last && $last eq "subj" && $_ !~ /^\S+:/ ){ $subj .= $_; } $last = ""; $seen = 1; # Normalize the 'From:' line to Mailbag standards if ( /^From:\s*(.*) (\S+)\@(\S+)\s*$/ ){ $from = "$1 [$2 at $3]"; $from =~ tr/<>"//d; $from =~ s/'(.*)'/$1/g; } # Ditto the subject if ( s/^Subject:\s*// ){ s/\[TAG\]\s*//; s/(?:re:|fw:|fwd:|forw:|balasan:|aw:|[\(\[](?:re|fw|fwd|forw|balasan|aw)[\)\]])\s*//ig; $subj = $_; $last = "subj"; } # Get the date $date = $1 if /^Date:\s*(.*?)\s*$/; next; } else { if ( $seen ){ # Flip the flag $seen = 0; # Create a unique filename from the subject ( $fn = $subj ) =~ y/A-Za-z0-9:/_/cs; $fn =~ s/^_*(.*?)_*$/$1/; } # Collect the non-header lines $body .= $_; } } close In; # Add last post &build; ############### Output section #################### # Build the header / define the title sub lg_header { # start_html(-title=>"$_[0]",-lang=>"utf-8",-style=>{src=>"../../../lg.css"}) . "\n". "\n". "\n". "$_[0]\n". "\n". "\n". "\n". "\n". a({href=>"../../../"},img({src=>"../../../gx/2003/newlogo-blank-200-gold2.jpg",id=>"logo",alt=>"Linux Gazette"})) . img({src=>"../../../gx/tux_86x95_indexed.png",id=>"tux",alt=>"Tux"}) . p({id=>"fun"},"...making Linux just a little more fun!") . "
" . a({name=>"top"},""); } my ( $issue, %seen ); # Get the current issue number open Lg, "$ENV{LG_LIBPYTHON}/lgconfig.py" or die "lgconfig.py: $!\n"; while ( ){ if ( /^currentIssueNumber.*?(\d+)/ ){ $issue = $1; last; } } close Lg; my $barename = $fname; $barename =~ s/\.html$//; # Create the "flat content" page for TWDT open Twdt, ">TWDT.lg_answer$issue-$barename.html" or die "TWDT.lg_answer$issue-$barename.html: $!\n"; open Fp, ">$fname" or die "$fname: $!\n"; print Fp "author: $author\ntitle: $title\n\n"; print Twdt h2($title), "\n"; # if the author is defined, get their name if ( $author ){ my $name; open Au, "$ENV{LG_ROOT}/authors/$author"; while ( ){ if ( /^name:\s*(.*)$/ ){ $name = $1; last; } } close Au; print Twdt p( "By ", a({href=>"../authors/$author.html"}, $name ) ); } # Process the participants' bios and extract the names if ( $title =~ /Mailbag/ ){ my %bios; # Do they have an LG bio? for our $Bio ( <$ENV{LG_ROOT}/authors/*> ){ chomp $Bio; open Bio or die "$Bio: $!\n"; for ( ){ chomp; last if /^$/; my( $k, $v ) = split /: ?/; if ( $v ){ $v =~ s/\s*$//; $bios{$Bio}{$k} = $v; } } } # Build list of LG participants my @took_part; for my $email ( sort keys %gang ){ for ( keys %bios ){ push @took_part, $bios{$_}{name} if $bios{$_}{privateEmail} && $bios{$_}{privateEmail} eq $email; } } print Fp h3( "This month's answers created by:" ), strong( "[ ", join( ", ", @took_part ), " ]" ), "\n", br, "...and you, our readers!", br, hr({size=>"3", width=>"50%", align=>"center"}); print Twdt h3( "This month's answers created by:" ), strong( "[ ", join( ", ", @took_part ), " ]" ), "\n", br, "...and you, our readers!", br, hr({size=>"3", width=>"50%", align=>"center"}); } # Add editorial commentary if a "-head" file exists if ( -f "$in_file-head" ){ print Fp h2("Editor's Note"); print Twdt h2("Editor's Note"); open Cmt, "$in_file-head" or die "$in_file-head: $!\n"; print Fp while ; print Twdt while ; print Fp hr({size=>"3", width=>"50%", align=>"center"}); print Twdt 
hr({size=>"3", width=>"50%", align=>"center"}); close Cmt; } # Define weighted sorting order for different categories (i.e., Mailbag is # first, etc.) See %sections at the top for relative weights. sub weight { my $str = substr $_[0], 0, 4; ( $str =~ s/:$// ) || return 100; return ( exists $sections{lc $str} ) ? $sections{lc $str}->[0] : 100; } # Create/overwrite the KnowledgeBase link file open Kb, ">$tag_kb/$issue-$fname" or die "$issue-$fname: $!\n"; for my $key ( sort { weight($a) <=> weight($b) } keys %files ){ my $sec = $key =~ /^(...):/ ? lc $1 : ""; # Flatten the key to create the link name ( my $lnk = $key ) =~ s/^...:_*//; $lnk = lc $lnk; $lnk =~ tr/a-z0-9_/_/sc; $lnk =~ s/^_*(.*?)_*(?:html)?_*$/$1/; # Print the section header just once if ( exists $sections{ $sec } ){ # Only print the section headers if they are part of the Mailbag # page if ( $title eq "Mailbag" ){ print Fp h1( $sections{ $sec }->[1]), "\n", hr, "\n" unless $seen{ $sec }++; print Twdt h1( $sections{ $sec }->[1]), "\n", hr, "\n" unless $seen{ $sec }++; } } else { if ($fname eq "lg_mail.html"){ print Fp h1("Our Mailbag"), "\n", hr, "\n" unless $seen{mailbag}++; print Twdt h1("Our Mailbag"), "\n", hr, "\n" unless $seen{mailbag}++; } } # Extract the subject ( my $subject = $files{ $key }->[0] ) =~ s#^.*?

([^<]+)

.*#$1#s; # Create unique anchor for thread print Fp "\n\n"; print Twdt "\n\n"; print Kb "[ LG #$issue ] $subject
\n"; my($shortie); # Chop off the first post if it's too long if ( length $files{ $key }->[0] > $cutoff ){ $shortie = substr $files{ $key }->[0], 0, ( rindex $files{ $key }->[0], "

", $cutoff ); print Fp $shortie, p("\n[ ... ]\n"); } else { print Fp $files{ $key }->[0], "\n"; } for ( @{ $files{ $key } } ){ print Twdt $_, br; } if ( $shortie || @{ $files{ $key } } > 1 ){ if (-f "misc/lg/$lnk.html"){ print "\nERROR: while processing '$in_file', I ran across a thread named '$lnk' that was longer\n" . "than the defined thread cutoff length ($cutoff characters.) This would normally result in\n" . "the creation of a file called 'misc/lg/$lnk.html', but this file ALREADY EXISTS.\n\n" . "Since there's no way for me to tell whether this is a result of an error (e.g., threads with\n" . "conflicting names from different mailboxes) or an accidental second run of this program on a\n" . "given mailbox, I have to give up and turn this over to a human.\n\n" . "To resolve the first problem, I suggest locating and renaming the thread in one of the mailboxes.\n" . "To resolve the second one, simply delete the 'misc/' directory and reprocess all the mailboxes.\n\n"; exit; } $shortie = ""; my $tnum = @{ $files{ $key } }; my $s = $tnum == 1 ? "" : "s"; my $tlen = sprintf "%.2f", length("@{ $files{ $key } }") / 1024; print Fp p( b("[ ", a({name=>"mb-$lnk"},""), a({href=>"misc/lg/$lnk.html"},"Thread continues here ($tnum message$s/${tlen}kB)"), " ]" ) ), "\n"; open Th, ">misc/lg/$lnk.html" or die "misc/lg/$lnk.html: $!\n"; print Th lg_header($subject); for ( @{ $files{ $key } } ){ # Fix the LG relative links in the thread s#(a href=['"])(\.\./)#$1$2$2$2#gsm; print Th $_, br, a({href=>"#top"},"Top"), " " x 4, a({href=>"../../$fname#mb-$lnk"},"Back"), hr({width=>"50%", align=>"left"}), p(br); } print Th "
", end_html; close Th; } print Fp hr, "\n\n"; print Twdt hr, "\n\n"; } close Fp; close Kb; close Twdt; # Make the KB file accessible to everyone chmod 0666, "$tag_kb/$issue-$fname"; =head1 NAME lg-process-mailbag - Processes the LG mailbox for publication =head1 SYNOPSIS lg-process-mailbag [output_filename.html] [title] NOTE: The exact output filename is significant, since it is used by the program to create the HTML title and the page heading if it matches a pattern. The equivalence table currently looks like this (it may be expanded later): Filename Title/header ======== ============ 'lg_talkback.html' => "Talkback", 'lg_talkback2.html' => "Talkback2", 'lg_talkback3.html' => "Talkback3", 'lg_talkback4.html' => "Talkback4", 'lg_tips.html' => "2-Cent Tips", 'lg_laundrette.html' => "The Linux Launderette", 'lg_launderette.html' => "The Linux Launderette", If no filename is specified, the output will be sent to 'lg_mail.html', and the title/header will default to "Mailbag". An optional title can also be specified as the last argument (after the filename); if it contains anything other than an unbroken string of alphanumeric characters, the entire title should be quoted. If a file with the same name as the input file plus a "-head" extension exists in the current directory, the content of that file will be inserted as editorial commentary below an "Editor's Note" header. The insert will be positioned below the title and credits but above the processed content. The content of the '-head' file should be HTML-formatted but should not contain an HTML header or footer. 'lg-process-mailbag' also creates a file containing a list of links, one per topic, for later insertion into the KnowledgeBase; the file name consists of the current issue number followed by a "-" and the name of the current output file. It is saved in a directory defined in the user-configurable section at the top of the script ('$SVN_ROOT/data/kb/' by default.) A "TWDT.lg_answer-.html" is also created. 
These are "flat" (non-threaded) representations of mailbag content, and are
intended to be concatenated (assuming there's more than one) into
"TWDT.lg_answer.html", which should be copied into $LG_ROOT/data/twdt, where
it will be read by our publication scripts and inserted into the TWDT during
processing. [NOTE: Currently, this process isn't usable - the build scripts
need to be hacked to ignore all the varieties of "lg_mail.html" while
building the issue. For now, simply ignore the resulting "TWDT" file.]

=head1 DESCRIPTION

'lg-process-mailbag' is the LG Mailbag processing script, which also handles
'Gazette Matters', 'Still Searching' (unanswered questions), and other
sections. Its output is a page containing all the first posts for each
thread in TAG (those that are too long are chopped off at the last paragraph
marker before the character count defined in '$cutoff' at the top of the
script) which are followed by links to a file comprising the rest of the
thread (if any.) It's made to be as automated as possible in order to
minimize hand-hacking; since humans will be humans, however, some twiddling
is usually required. :)

=head1 USAGE

1) Open the TAG mail archive in Mutt and delete all the repeated and
"uninteresting" messages (spam, broken messages from 'bogus', etc.)

2) Tag (using 't' for single messages, or 'esc-t' for threads) all messages
that don't belong in the mailbag (e.g. Talkbacks, Launderette, etc.) and
save them into distinctively-named mailboxes with the ';s' (tagged-save)
command. The 'l' (limit) key can also be very helpful in selecting the
messages that fit a pattern.

3) All messages with identical 'Subject:' lines will be grouped into
individual threads by the script; therefore, messages are added to or
removed from threads by changing their subject lines (but see the "Tags"
subsection in "EDITING", below.)

4) Quit Mutt and run the script to build the Mailbag page and the related
files in '$PWD/misc/lg/'.

5) Review the produced HTML file and fix any problems you discover by
editing the mailbox directly (don't edit individual messages from within
Mutt; this changes the position of the email within the file and thus its
position within the thread.) Please see the EDITING section, below, for
specific editing tips and tricks.

6) Take a look at the files linked from within the Mailbag page (the links
are at the bottom of each post that exceeds '$cutoff' in length) and fix any
really serious problems. The idea here is to pluck the low-hanging fruit; as
long as the content is easily readable, small discrepancies don't matter.

7) When you're done, move the output file and the 'misc' subdirectory into
$LG_ARTICLES/[current_issue]/, then copy the original mbox to
$LG_ROOT/data/tag/ (you'll need to create the 'issue_num' directory) and
check everything into the repository with 'svn ci'.

=head1 EDITING

=head2 Tags

Any email that is marked with a special tag will be placed in the section
specified by that tag. Tags consist of a three-letter abbreviation and a
colon at the start of the subject, as follows:

    gaz: Gazette Matters
    sts: Still Searching
    tkb: Talkbacks
    tct: 2-Cent Tips
    gem: LG Gems

(e.g. 'Subject: sts: Looking for moonbat weasels in Cleveland'.) Tag case is
not significant, and the tags will be removed from the subject during
processing. The sections in the generated Mailbag page are arranged in the
above order; any email which does not have a tag will go into the main
("Mailbag") section, which comes last.

=head2 Formatting Problems

Most of the HTML conversion is done by the program, which is fairly smart.
However, like any automated gadget, sometimes it's going to get things wrong.

=over 1

=item *
Unformatted code

The most common problem is code that gets all wrapped into one (or more)
lines instead of being nicely formatted.
The reason is that it wasn't properly delimited in the original email (i.e.,
with either a pair or a trio of backticks and single quotes - see the
Members FAQ, http://linuxgazette.net/faq/members-faq.html#markup for exact
details.)

FIX: Edit the original mail and insert the appropriate delimiters before and
after the code.

=item *
Collapsed lists

If someone has a neat little list of items in their mail, it is also going
to get wrapped. However, converting it to preformatted code (essentially
using <pre> tags, as in the above example) is
usually inappropriate due to the font used in <pre>s.

FIX: Enter a blank line between the individual list items. They will
now become separate paragraphs, which looks just fine.

=item *
Huge amounts of whitespace in preformatted items

This is something like the opposite of the last problem; what is
happening is that in preformatted items other than the standard
backtick/single quote delimited code (e.g., quoted email which is
denoted by '>'s at the start of the line), multiple blank lines 
are still converted to paragraphs. In general, this shouldn't happen - 
the greatest majority of this is handled by the script - but there's
still a small chance of it.

FIX: Insert a single space at the beginning of each blank line within
the quoted material. The paragraph tag pairs only get inserted into
successive runs of newlines (returns), and adding a space makes it not match
the search pattern anymore. _Voila,_ no more Antarctica (wide-open expanses
of white.)

=back

Most other layout problems can be cured by making the text preformatted -
i.e., by wrapping it in backtick/single quote sets.

=head1 SPECIFIC FEATURES

As mentioned earlier, the script is fairly smart about processing mail text.
Here are some of its built-in features; it's important to recognize them and
to know how they work, since the easiest way to fix problems is often a
matter of adjusting the content to follow the rules that it should have
followed in the first place.

=over 1

=item *
[PRIVATE]...[/PRIVATE] clause

Anything delimited with the above tags will be replaced with a
'[[[Elided content]]]' string. Not that TAG is a private list, but having
the option is nice.

=item *
[RAW]...[/RAW] clause

Anything delimited with these tags is "protected" from processing and will
not be modified by the script. Particularly useful for any HTML entities or
tags that you want to preserve as they stand (i.e., that you want to be used
as HTML tags rather than displayed as content.)

=item *
The TAG footer is automatically removed

This footer is the standard block of text that is appended to all TAG mail;
it is defined as starting with '+-+--- [...] ---+-+' and ending with
"http://lists.linuxgazette.net/mailman/listinfo/tag", with several lines of
text between the two.

=item *
Non-HTML-parseable characters are converted to entities

'<', '>', and '&' are converted to their HTML equivalents. (Conversion of
the characters from 160 to 255 is currently disabled in the script, since it
breaks Unicode characters.)

=item *
Mailman's "Attachment was scrubbed" messages are removed

These are defined as the above phrase plus a URL where the attachment is
stored.

=item *
All signature blocks are converted to preformatted text

A signature block is a line consisting of two dashes, a space, and a newline
followed by any number of non-blank lines.
If you see a signature block getting wrapped into a one-line mess, just
insert the '-- ' delimiter immediately above it.

=item *
Cited email headers are preformatted

Cited email headers (i.e., a run of lines beginning with any of
"Date|From|To|Subject|Newsgroups|User-Agent|X-Rcpt-To|X-Country|X-UIDL|X-Bogosity|X-Mas|Message-ID|Reply-To|CC|Cc")
will be converted to formatted text. If they don't start at the beginning of
the line, they will not be converted - so formatting them is often a matter
of removing the preceding whitespace.

=item *
Quoted email text is preformatted

Text preceded by '>'s at the beginning of the line will have its layout
preserved.

=item *
All "TAG markup" will be appropriately converted

Any blocks of text preceded by two backticks and followed by two single
quotes will be turned into formatted text; three of each will mark it as
code (formatted in a nice colored box - preferred for
actual code examples.) Make sure that these markers start at the beginning
of the line, and that nothing else follows them on that line.

=item *
"Email enhancements" are appropriately applied

Single words in *asterisks* will be made bold; those in _underscores_ will
be made italic. Note that words in /slashes/ will be ignored although they
are commonly used in email: treating them specially would screw up file paths...

=item *
URLs are hotlinked

Anything beginning with 'http://' is treated as a URL and is automatically
converted to a hotlink. LG URLs are turned into hotlinks with URLs pointing
to the local file structure.

=item *
Smilies are replaced with images (and use the text as an 'alt' link); ditto
the 'frownies'.

':)', ':-)', ':>', and ':->' will point to 'smile.png'.
':(', ':-(', ':<', and ':-<' will point to 'frown.png'.

=item *
Blank lines delimit paragraphs

=item *
Editorial comments

If you wish to comment on something in the text, feel free: start with a
line consisting of '@#$', insert whatever you want to write, and finish it
with a line consisting of '$#@'. Your insert will become an editorial comment.

=back

=head1 AUTHOR

Ben Okopnik (ben@linuxgazette.net)

=cut