I recently spent a little time figuring out how to compile various pages into a PDF book. The key to doing this, I found, is to use 'htmldoc', which can easily be downloaded for various platforms.
Here is a summary of the scripts that I used.
make_pdf.sh controls the overall processing; it uses the list of files stored in 'chapters.txt' to determine which files to process.
#!/bin/sh
# make_pdf.sh - assemble the chapters listed in chapters.txt into a PDF book.
#
# Usage: ./make_pdf.sh      (run from the directory that holds chapters.txt)
#
# Reads:    chapters.txt (one chapter file per line), greek.sed, title.html
# Runs:     ./txttohtml.sh, ./make_index.sh, htmldoc, pdftk
# Produces: output.pdf (book, followed by the index and a status report) and
#           tmpNNNN.html; missing_files / image_files are whatever
#           txttohtml.sh appended during this run.
# Exit:     non-zero when the final pdftk merge fails.

# Print the number of lines in file $1, or 0 when the file does not exist.
# missing_files and image_files are deleted below and only come back if
# something appends to them, so either may be absent.
count_lines() {
  if [ -f "$1" ]; then
    wc -l < "$1" | awk '{print $1}'
  else
    echo 0
  fi
}

rm -f missing_files image_files

# Convert every chapter to a zero-padded tmpNNNN.html so that the tmp*.html
# glob below hands the chapters to htmldoc in chapters.txt order.
# NOTE(review): tmp*.html files left behind by an earlier run with more
# chapters are not removed here and would still match the glob.
i=0
while read -r CHAPTER
do
  # With no file argument txttohtml.sh would read this loop's stdin (the
  # rest of chapters.txt), so blank lines are skipped.
  [ -n "$CHAPTER" ] || continue

  i=$((i + 1))
  TMP=$(printf 'tmp%04d.html' "$i")
  echo "$TMP"
  ./txttohtml.sh "$CHAPTER" | sed -f greek.sed > "$TMP"

  # Hard cap on the number of chapters processed.
  if [ "$i" -eq 500 ]
  then
    break
  fi
done < chapters.txt

htmldoc --book --linkstyle plain --toctitle "The Molecular Universe" \
  -f output.pdf --no-title --headfootfont times --headfootsize 10 \
  --charset iso-8859-7 --embedfonts --size letter \
  tmp*.html --titlefile title.html

./make_index.sh output.pdf 12

# make a report on the current status to append to the book
WORDS=$(wc output.txt | awk '{print $2}')
MISSING=$(count_lines missing_files)
IMAGES=$(count_lines image_files)

{
  echo "<pre>"
  echo "PDF file created:"
  date
  echo ""
  echo "Current word total:  $WORDS"
  echo ""
  echo "Number of image files that should be enlarged:  $MISSING"
  echo ""
  echo "Missing file names follow: "
  if [ -f missing_files ]; then cat missing_files; fi
  echo ""
  echo "Current image file count:  $IMAGES"
  echo "All image file names follow: "
  if [ -f image_files ]; then cat image_files; fi
  echo "</pre>"
} > status.html

htmldoc --webpage -f status.pdf --no-title --size letter status.html

# Append the index and the status report to the book. pdftk needs an explicit
# "cat" operation to join several inputs and will not write onto one of its
# own input files, so merge into a scratch file and then move it into place.
merge_status=0
if pdftk A=output.pdf B=output.index.pdf C=status.pdf \
  cat A B C output output.merged.pdf
then
  mv -f output.merged.pdf output.pdf
else
  merge_status=1
  echo "make_pdf.sh: pdftk merge failed; output.pdf has no index or status pages" >&2
  rm -f output.merged.pdf
fi

# Remove the intermediates only; output.pdf is the finished book and is kept.
rm -f output.txt output.index.pdf output.data.txt status.html status.pdf

if [ -f missing_files ]
then
  echo "THERE ARE  $MISSING  MISSING FILES"
  cat missing_files
fi

exit "$merge_status"
txttohtml.sh is a very crude script that converts a raw nanoblogger text file into a crude HTML file that can be used as input to the htmldoc processor.
#!/bin/sh
# txttohtml.sh - convert one raw nanoblogger entry into simple HTML for htmldoc.
#
# Usage: txttohtml.sh <entry file>      (reads stdin when no file is given)
#
# Output (stdout):
#   - an <h1> heading built from the text after "TITLE:";
#   - the lines between "BODY:" and "END-----", with these rewrites:
#       * inside an image table, "left"/"right" become "center";
#       * on an <img src=...> line, everything from the first to the last
#         double quote is replaced by the path of the full-size image under
#         ../../../images/ (name taken from src with "_scale" removed), when
#         that file, or its .png sibling for a .gif name, can be read;
#       * a full-width spacer table is emitted after each image table.
#   - <blockquote> tags become a bordered one-cell table; they are handled
#     before the BODY check.
#
# Side effects (relative to the current directory):
#   - appends every image path considered to image_files (quoted when found);
#   - appends the name of each image that could not be found to missing_files
#     and reports it on stderr.

awk '
# Return 1 when at least one record can be read from path, otherwise 0.
# The getline status is used so the answer does not depend on file content.
function readable(path,    firstline, status) {
  status = (getline firstline < path)
  close(path)
  return status > 0
}

{
  # Heading: the text after the 6-character "TITLE:" prefix.
  if (index($0, "TITLE:")) {
    title = substr($0, 7)
    print "<h1> " title " </h1>"
  }

  # Opening blockquote: start the boxed table and emit the following line
  # with its first "Note:" set in bold.
  if (index($0, "<blockquote>")) {
    print "<table border=\"1\" cellpadding=\"10\"><tr><td>"
    getline
    sub("Note:", "<b>Note:</b>")
    print $0
    next
  }
  if (index($0, "</blockquote>")) {
    print "</td></tr></table>"
    next
  }

  # Only lines between BODY: and END----- are copied through.
  if (index($0, "BODY:")) {
    intext = 1
    next
  }
  if (!intext)
    next
  if (index($0, "END-----")) {
    intext = 0
    exit
  }

  if (index($0, "<table class=\"image"))
    imagetable = 1

  # Centre everything that sits in an image table.
  if (imagetable) {
    sub("\"left\"", "\"center\"")
    sub("\"right\"", "\"center\"")
  }

  if (index($0, "<img src=")) {
    # Reduce the line to the bare image file name: drop everything up to
    # src=, any trailing attributes, the directory part, all quotes, and
    # the "_scale" marker.
    record = $0
    sub("^.*<img src=", "", record)
    sub(" .*$", "", record)
    sub("^.*/", "", record)
    gsub("\"", "", record)
    sub("_scale", "", record)

    # Try the original extension first, then .png in place of .gif.
    candidate = "../../../images/" record
    located = readable(candidate)
    if (!located && candidate ~ /\.gif$/) {
      sub(/\.gif$/, ".png", candidate)
      located = readable(candidate)
    }

    if (located) {
      largefile = "\"" candidate "\""
      # Replaces the span from the first to the last double quote on the line.
      sub("\".*\"", largefile)
    } else {
      largefile = candidate
      # stderr rather than /dev/tty so runs without a terminal do not abort.
      print "COULD NOT LOCATE " record > "/dev/stderr"
      print record >> "missing_files"
    }
    print largefile >> "image_files"
  }

  # Close of an image table: follow it with a full-width spacer table.
  if (index($0, "</table") && imagetable) {
    imagetable = 0
    print $0
    print "<table width=\"100%\" summary=\"\"><tr><td> </td></tr></table>"
    next
  }

  print $0
}' ${1+"$1"}
make_index.sh comes from pdftk and uses two tools from the pdftk site to create a crude book-like index for the book. I modified the make_index.sh script a little, so I am including it here.
#!/bin/sh
# make_index.sh, version 1.0
# usage: make_index.sh <PDF filename> <page window>
# requires: pdftk, kw_catcher, page_refs,
#           pdftotext, enscript, ps2pdf
#
# by Ross Presser, Imtek.com
# adapted by Sid Steward
# http://www.pdfhacks.com/kw_index/
# modified somewhat from the original distributed version to correct
# problems encountered in initial testing
#
# Notes on this modified version:
#   - <page window> is accepted but never read, and kw_catcher is not invoked;
#     the index terms are taken from index-terms.dat in the current directory.
#   - Only the base name of <PDF filename> is used, so the PDF is looked up in
#     the current directory.
#   - Writes <name>.data.txt, <name>.txt and <name>.index.pdf.
#   - The exit status is that of the first failing step, or of the final
#     ps2pdf pipeline.

export PATH=/opt/local/bin:/opt/local/sbin:$PATH
LANG=C
export LANG

if [ $# -lt 1 ]; then
  echo "usage: make_index.sh <PDF filename> <page window>" >&2
  exit 2
fi

fname=$(basename "$1" .pdf)

# Per-run scratch file for the sed rewrite; removed on any exit path.
scratch="${fname}.data.txt.$$"
trap 'rm -f "$scratch"' EXIT

# 1. Dump the PDF metadata and rewrite the roman-numeral page-label style to
#    decimal (presumably so page_refs reports arabic page numbers - confirm).
# 2. Extract the text of the PDF.
# 3. Have page_refs look up the index terms, strip the PageLabelNumStyle:
#    label from its output, typeset the result in two columns with enscript
#    and convert it to PDF.
pdftk "${fname}.pdf" dump_data output "${fname}.data.txt" &&
  sed 's/LowercaseRomanNumerals/DecimalArabicNumerals/' \
    "${fname}.data.txt" > "$scratch" &&
  mv "$scratch" "${fname}.data.txt" &&
  pdftotext "${fname}.pdf" "${fname}.txt" &&
  page_refs "${fname}.txt" index-terms.dat "${fname}.data.txt" \
    | sed 's/PageLabelNumStyle://g' \
    | enscript --columns 2 --font 'Times-Roman@10' \
        --header '|Index' --header-font 'ArialBold@20' \
        --margins 54:54:36:54 --word-wrap --output - \
    | ps2pdf - "${fname}.index.pdf"