a dependency list: working towards a simpler install process
[dja/scandal.git] / scan2pages.sh
index dbd66ee..5fbdab5 100755 (executable)
@@ -128,9 +128,18 @@ for scanpgnum in `$my_seq 1 $pages`; do
 done;
 
 # do ocr binarise
-[ -e $outdir/scanpgs ] && rm -r $outdir/scanpgs
-ocropus book2pages $outdir/scanpgs $outdir/scanpg-*.png || exit 1
+if [ -e $outdir/scanpgs ]; then
+       #assume (for the sake of speed in repeated runs) that
+       # if the last file exists, the process completed successfully
+       if [ -e $outdir/scanpgs/$(printf '%04d' $pages).bin.png ]; then
+               [[ $verbose ]] && echo Binarisation already complete
+       else
+               rm -r $outdir/scanpgs
+       fi
+fi
+[ -e $outdir/scanpgs ] || ocropus book2pages $outdir/scanpgs $outdir/scanpg-*.png || exit 1
 
+# process each binarised scan page
 for scanpgnum in  `$my_seq 1 $pages`; do
 
        binscanpg=$outdir/scanpgs/$(printf '%04d' $scanpgnum).bin.png
@@ -147,7 +156,7 @@ for scanpgnum in  `$my_seq 1 $pages`; do
                # ... get crop co-ords. They're off by ~2 as I don't know how to
                # properly correct for the border.
                cropcords=$(convert -border 1x1 -bordercolor '#000' -resize 1000% -trim -fuzz 90% -format "%wx%h%O" $outdir/scanpg-mask-${scanpgnum}.png info: || exit 1)
-               
+
                # ... crop and despeckle? the final pre-prepared image
                convert $convertflags -crop $cropcords $binscanpg  $cleanscanpg || exit 1
        fi;
@@ -162,7 +171,11 @@ for scanpgnum in  `$my_seq 1 $pages`; do
        #unpaper it
        physpgbase=$outdir/physpg-${scanpgnum}
        if [ ! -e ${physpgbase}-1.pnm ] || [ ! -e ${physpgbase}-2.pnm ]; then
-               unpaper $unpaperflags --layout double --overwrite -ni 10 -op 2 $cleanscanpg ${physpgbase}-%01d.pnm || exit 1
+               # most of unpaper's processing is redundant given ocropus, and somewhat too aggressive
+               unpaper $unpaperflags --layout double --overwrite -op 2 \
+                       --no-processing \
+                       $cleanscanpg ${physpgbase}-%01d.pnm || exit 1
+
        fi;
        
        for physpgnum in $($my_seq 1 2); do
@@ -177,13 +190,22 @@ for scanpgnum in  `$my_seq 1 $pages`; do
                        convert $convertflags -resize 10% -depth 8 -gamma 0.01 -median 2 $physpg $outdir/physpg-mask-${scanpgnum}-${physpgnum}.png ||exit 1
 
                        # Trim #-border 1x1 -bordercolor '#fff' -trim -fuzz 30% 
-                       cropcords=$(convert -trim -fuzz 90%\
+                       # binarisation is so effective that we can try a fuzz well below 90%
+                       cropcords=$(convert -trim -fuzz 50%\
                         -resize 1000% -format "%wx%h%O" $outdir/physpg-mask-${scanpgnum}-${physpgnum}.png info: || exit 1)
                
+                       [[ $verbose ]] && echo Crop co-ords: $cropcords
+
                        # ... crop and despeckle? the final pre-prepared image
                        convert $convertflags -crop $cropcords $physpg $cleanphyspg || exit 1
                fi;
-               
+
+               # check it hasn't mostly disappeared; if it has, warn loudly and skip this page
+               if [[ $(convert  $cleanphyspg -format '%[fx:s.w*s.h>1000]' info:) = "0" ]]; then
+                       echo "Warning: discarding physical pg ${scanpgnum}-${physpgnum}: not enough remains after masking."
+                       continue;
+               fi;
+
                #detect if the page is 2-up
 
                if [[ $logperphys == 2 ]] || ( [[ $logperphys != 1 ]] && $(dirname $0)/detect2pages.sh ${cleanphyspg} ${scanpgnum} ${physpgnum} ); then
@@ -201,8 +223,8 @@ for scanpgnum in  `$my_seq 1 $pages`; do
                convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/logpg-${scanpgnum}-${physpgnum}-%01d.png || exit 1
 
                #final convert and clean w/ bebook optimisation
-               if [[ $bebook ]]; then
-                       convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim -fuzz 80% -resize 1200x1600 $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1
+               if [[ $bebook ]]; then #1200x1600
+                       convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1
                else
                        convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1
                fi
@@ -211,13 +233,13 @@ for scanpgnum in  `$my_seq 1 $pages`; do
 
 done
 
-#try full ocr
-rm -rf $outdir/logpgs
-ocropus book2pages $outdir/logpgs $outdir/logpg-*.png
-ocropus pages2lines $outdir/logpgs
-ocropus lines2fsts $outdir/logpgs
-ocropus fsts2bestpaths $outdir/logpgs
-ocropus buildhtml $outdir/logpgs > $outdir/out.html
+#don't try full ocr, it's a waste of time.
+#rm -rf $outdir/logpgs
+#ocropus book2pages $outdir/logpgs $outdir/logpg-*.png
+#ocropus pages2lines $outdir/logpgs
+#ocropus lines2fsts $outdir/logpgs
+#ocropus fsts2bestpaths $outdir/logpgs
+#ocropus buildhtml $outdir/logpgs > $outdir/out.html
 
 mkdir -p $outdir/pages
 mv $outdir/final-*.${extension} $outdir/pages

UCC git Repository :: git.ucc.asn.au