3. determine dpi
4. foreach double-page-spread (scan page)
4.1. extract scan page from pdf, save as png
- 4.2. run a mask over it to pull off large black areas
- 4.3. run unpaper over it, creating 2 pages (physical page)
- 4.4. foreach physical page
- 4.4.1. remask and retrim
- 4.4.2. attempt to detect if a physical page contains 2 logical pages,
- 4.4.2.1. if so split with unpaper
- 4.4.3. do any final processing (resize for bebook)
-5. move all the final pictures into a final picture directory
-In the accidentally deleted code we used ocropus's binarise stuff to do some
-extra cleaning.
+5. run ocropus's binarise over all the pngs
+
+6. foreach binarised scan page
+ 6.1. create a mask from the original (unbinarised) page
+ 6.2. use the mask to trim the binarised page (cutting the masked-out areas off improves unpaper's accuracy)
+ 6.3. run unpaper over the clean binarised page, creating 2 pages (physical page)
+ 6.4. foreach physical page
+ 6.4.1. remask and retrim
+ 6.4.2. attempt to detect if a physical page contains 2 logical pages,
+ 6.4.2.1. if so split with unpaper
+ 6.4.3. do any final processing (resize for bebook)
+7. move all the final pictures into a final picture directory
= What options do we need? =
Anything we attempt to detect automatically should also have an option to set it manually
$scanpg || exit 1
fi;
- # preprocess it!
+done;
+
+# do ocr binarise
+if [ -e $outdir/scanpgs ]; then
+ #assume (for the sake of speed in repeated runs) that
+ # if the last file exists, the process completed successfully
+ if [ -e $outdir/scanpgs/$(printf '%04d' $pages).bin.png ]; then
+ [[ $verbose ]] && echo Binarisation already complete
+ else
+ rm -r $outdir/scanpgs
+ fi
+fi
+[ -e $outdir/scanpgs ] || ocropus book2pages $outdir/scanpgs $outdir/scanpg-*.png || exit 1
+
+# process each binarised scan page
+for scanpgnum in `$my_seq 1 $pages`; do
+
+ binscanpg=$outdir/scanpgs/$(printf '%04d' $scanpgnum).bin.png
+ scanpgnum=$(printf '%03d' $scanpgnum)
+ scanpg=$outdir/scanpg-${scanpgnum}.png
+
+ # preprocess scanned page
cleanscanpg=$outdir/scanpg-clean-${scanpgnum}.pnm
if [ ! -e $cleanscanpg ]; then
# create mask:
# ... get crop co-ords. They're off by ~2 as I don't know how to
# properly correct for the border.
cropcords=$(convert -border 1x1 -bordercolor '#000' -resize 1000% -trim -fuzz 90% -format "%wx%h%O" $outdir/scanpg-mask-${scanpgnum}.png info: || exit 1)
-
+
# ... crop and despeckle? the final pre-prepared image
- convert $convertflags -crop $cropcords $scanpg $cleanscanpg || exit 1
- elif [[ $skipmask ]]; then
- cp $origfile $preppnm
+ convert $convertflags -crop $cropcords $binscanpg $cleanscanpg || exit 1
fi;
# check it hasn't mostly disappeared - e.g. if the scan was all black
#unpaper it
physpgbase=$outdir/physpg-${scanpgnum}
if [ ! -e ${physpgbase}-1.pnm ] || [ ! -e ${physpgbase}-2.pnm ]; then
- unpaper $unpaperflags --layout double --overwrite -ni 10 -op 2 $cleanscanpg ${physpgbase}-%01d.pnm || exit 1
+ # most of unpaper's processing is redundant given ocropus, and somewhat too aggressive
+ unpaper $unpaperflags --layout double --overwrite -op 2 \
+ --no-processing \
+ $cleanscanpg ${physpgbase}-%01d.pnm || exit 1
+
fi;
for physpgnum in $($my_seq 1 2); do
convert $convertflags -resize 10% -depth 8 -gamma 0.01 -median 2 $physpg $outdir/physpg-mask-${scanpgnum}-${physpgnum}.png ||exit 1
# Trim #-border 1x1 -bordercolor '#fff' -trim -fuzz 30%
- cropcords=$(convert -trim -fuzz 90%\
+ # binarise is so effective, try something << 90%
+ cropcords=$(convert -trim -fuzz 50%\
-resize 1000% -format "%wx%h%O" $outdir/physpg-mask-${scanpgnum}-${physpgnum}.png info: || exit 1)
+ [[ $verbose ]] && echo Crop co-ords: $cropcords
+
# ... crop and despeckle? the final pre-prepared image
convert $convertflags -crop $cropcords $physpg $cleanphyspg || exit 1
fi;
-
+
+ # check it hasn't mostly disappeared, warn viciously if it has!
+ if [[ $(convert $cleanphyspg -format '%[fx:s.w*s.h>1000]' info:) = "0" ]]; then
+ echo "Warning: discarding physical pg ${scanpgnum}-${physpgnum}: not enough remains after masking."
+ continue;
+ fi;
+
#detect if the page is 2-up
if [[ $logperphys == 2 ]] || ( [[ $logperphys != 1 ]] && $(dirname $0)/detect2pages.sh ${cleanphyspg} ${scanpgnum} ${physpgnum} ); then
cp $cleanphyspg $outdir/logpg-${scanpgnum}-${physpgnum}-1.pnm
fi;
-
+ #prepare for ocr
+ convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/logpg-${scanpgnum}-${physpgnum}-%01d.png || exit 1
+
#final convert and clean w/ bebook optimisation
- if [[ $bebook ]]; then
- convert $convertflags -colorspace Gray -median 1 $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim -fuzz 80% -resize 1200x1600 $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1
+ if [[ $bebook ]]; then #1200x1600
+ convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1
else
convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1
fi
-
+
done;
done
+#don't try full ocr, it's a waste of time.
+#rm -rf $outdir/logpgs
+#ocropus book2pages $outdir/logpgs $outdir/logpg-*.png
+#ocropus pages2lines $outdir/logpgs
+#ocropus lines2fsts $outdir/logpgs
+#ocropus fsts2bestpaths $outdir/logpgs
+#ocropus buildhtml $outdir/logpgs > $outdir/out.html
+
mkdir -p $outdir/pages
mv $outdir/final-*.${extension} $outdir/pages