Don't redo binarise if not needed.
[dja/scandal.git] / scan2pages.sh
index b2bad10..cbfe52f 100755 (executable)
@@ -125,7 +125,28 @@ for scanpgnum in `$my_seq 1 $pages`; do
                        $scanpg || exit 1
        fi;
        
-       # preprocess it!
+done;
+
+# do ocr binarise
+if [ -e $outdir/scanpgs ]; then
+       #assume (for the sake of speed in repeated runs) that
+       # if the last file exists, the process completed successfully
+       if [ -e $outdir/scanpgs/$(printf '%04d' $pages).bin.png ]; then
+               [[ $verbose ]] && echo Binarisation already complete
+       else
+               rm -r $outdir/scanpgs
+       fi
+fi
+[ -e $outdir/scanpgs ] || ocropus book2pages $outdir/scanpgs $outdir/scanpg-*.png || exit 1
+
+# process each binarised scan page
+for scanpgnum in  `$my_seq 1 $pages`; do
+
+       binscanpg=$outdir/scanpgs/$(printf '%04d' $scanpgnum).bin.png
+       scanpgnum=$(printf '%03d' $scanpgnum)
+       scanpg=$outdir/scanpg-${scanpgnum}.png
+
+       # preprocess scanned page
        cleanscanpg=$outdir/scanpg-clean-${scanpgnum}.pnm
        if [ ! -e $cleanscanpg ]; then
                # create mask: 
@@ -137,9 +158,7 @@ for scanpgnum in `$my_seq 1 $pages`; do
                cropcords=$(convert -border 1x1 -bordercolor '#000' -resize 1000% -trim -fuzz 90% -format "%wx%h%O" $outdir/scanpg-mask-${scanpgnum}.png info: || exit 1)
                
                # ... crop and despeckle? the final pre-prepared image
-               convert $convertflags -crop $cropcords $scanpg $cleanscanpg || exit 1
-       elif [[ $skipmask ]]; then
-               cp $origfile $preppnm
+               convert $convertflags -crop $cropcords $binscanpg  $cleanscanpg || exit 1
        fi;
        
        # check it hasn't mostly disappeared - e.g. if the scan was all black
@@ -152,7 +171,11 @@ for scanpgnum in `$my_seq 1 $pages`; do
        #unpaper it
        physpgbase=$outdir/physpg-${scanpgnum}
        if [ ! -e ${physpgbase}-1.pnm ] || [ ! -e ${physpgbase}-2.pnm ]; then
-               unpaper $unpaperflags --layout double --overwrite -ni 10 -op 2 $cleanscanpg ${physpgbase}-%01d.pnm || exit 1
+               # most of unpaper's processing is redundant given ocropus, and somewhat too aggressive
+               unpaper $unpaperflags --layout double --overwrite -op 2 \
+                       --no-processing \
+                       $cleanscanpg ${physpgbase}-%01d.pnm || exit 1
+
        fi;
        
        for physpgnum in $($my_seq 1 2); do
@@ -187,18 +210,28 @@ for scanpgnum in `$my_seq 1 $pages`; do
                        cp $cleanphyspg $outdir/logpg-${scanpgnum}-${physpgnum}-1.pnm
                fi;
 
-       
+               #prepare for ocr
+               convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/logpg-${scanpgnum}-${physpgnum}-%01d.png || exit 1
+
                #final convert and clean w/ bebook optimisation
                if [[ $bebook ]]; then
-                       convert $convertflags -colorspace Gray -median 1 $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim -fuzz 80% -resize 1200x1600 $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1
+                       convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim -fuzz 80% -resize 1200x1600 $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1
                else
                        convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1
                fi
-       
+
        done;
 
 done
 
+#try full ocr
+rm -rf $outdir/logpgs
+ocropus book2pages $outdir/logpgs $outdir/logpg-*.png
+ocropus pages2lines $outdir/logpgs
+ocropus lines2fsts $outdir/logpgs
+ocropus fsts2bestpaths $outdir/logpgs
+ocropus buildhtml $outdir/logpgs > $outdir/out.html
+
 mkdir -p $outdir/pages
 mv $outdir/final-*.${extension} $outdir/pages
        

UCC git Repository :: git.ucc.asn.au