X-Git-Url: https://git.ucc.asn.au/?p=dja%2Fscandal.git;a=blobdiff_plain;f=scan2pages.sh;fp=scan2pages.sh;h=5fbdab5ffdeba68b5684bdad7a60c59fb525574d;hp=b2bad107ea4837582e6ce27de538b75929c0a568;hb=8fe732e83a703d38e483d0925f0c3bc1b2683910;hpb=2312058a395f4cedf83c5c19434a9c9b400c9e56 diff --git a/scan2pages.sh b/scan2pages.sh index b2bad10..5fbdab5 100755 --- a/scan2pages.sh +++ b/scan2pages.sh @@ -125,7 +125,28 @@ for scanpgnum in `$my_seq 1 $pages`; do $scanpg || exit 1 fi; - # preprocess it! +done; + +# do ocr binarise +if [ -e $outdir/scanpgs ]; then + #assume (for the sake of speed in repeated runs) that + # if the last file exists, the process completed successfully + if [ -e $outdir/scanpgs/$(printf '%04d' $pages).bin.png ]; then + [[ $verbose ]] && echo Binarisation already complete + else + rm -r $outdir/scanpgs + fi +fi +[ -e $outdir/scanpgs ] || ocropus book2pages $outdir/scanpgs $outdir/scanpg-*.png || exit 1 + +# process each binarised scan page +for scanpgnum in `$my_seq 1 $pages`; do + + binscanpg=$outdir/scanpgs/$(printf '%04d' $scanpgnum).bin.png + scanpgnum=$(printf '%03d' $scanpgnum) + scanpg=$outdir/scanpg-${scanpgnum}.png + + # preprocess scanned page cleanscanpg=$outdir/scanpg-clean-${scanpgnum}.pnm if [ ! -e $cleanscanpg ]; then # create mask: @@ -135,11 +156,9 @@ for scanpgnum in `$my_seq 1 $pages`; do # ... get crop co-ords. They're off by ~2 as I don't know how to # properly correct for the border. cropcords=$(convert -border 1x1 -bordercolor '#000' -resize 1000% -trim -fuzz 90% -format "%wx%h%O" $outdir/scanpg-mask-${scanpgnum}.png info: || exit 1) - + # ... crop and despeckle? the final pre-prepared image - convert $convertflags -crop $cropcords $scanpg $cleanscanpg || exit 1 - elif [[ $skipmask ]]; then - cp $origfile $preppnm + convert $convertflags -crop $cropcords $binscanpg $cleanscanpg || exit 1 fi; # check it hasn't mostly disappeared - e.g. if the scan was all black @@ -152,7 +171,11 @@ for scanpgnum in `$my_seq 1 $pages`; do #unpaper it physpgbase=$outdir/physpg-${scanpgnum} if [ ! -e ${physpgbase}-1.pnm ] || [ ! -e ${physpgbase}-2.pnm ]; then - unpaper $unpaperflags --layout double --overwrite -ni 10 -op 2 $cleanscanpg ${physpgbase}-%01d.pnm || exit 1 + # most of unpaper's processing is redundant given ocropus, and somewhat too agressive + unpaper $unpaperflags --layout double --overwrite -op 2 \ + --no-processing \ + $cleanscanpg ${physpgbase}-%01d.pnm || exit 1 + fi; for physpgnum in $($my_seq 1 2); do @@ -167,13 +190,22 @@ for scanpgnum in `$my_seq 1 $pages`; do convert $convertflags -resize 10% -depth 8 -gamma 0.01 -median 2 $physpg $outdir/physpg-mask-${scanpgnum}-${physpgnum}.png ||exit 1 # Trim #-border 1x1 -bordercolor '#fff' -trim -fuzz 30% - cropcords=$(convert -trim -fuzz 90%\ + # binarise is so effective, try something << 90% + cropcords=$(convert -trim -fuzz 50%\ -resize 1000% -format "%wx%h%O" $outdir/physpg-mask-${scanpgnum}-${physpgnum}.png info: || exit 1) + [[ $verbose ]] && echo Crop co-ords: $cropcords + # ... crop and despeckle? the final pre-prepared image convert $convertflags -crop $cropcords $physpg $cleanphyspg || exit 1 fi; - + + # check it hasn't mostly disappeared, warn viciously if it has! + if [[ $(convert $cleanphyspg -format '%[fx:s.w*s.h>1000]' info:) = "0" ]]; then + echo "Warning: discarding physical pg ${scanpgnum}-${physpgnum}: not enough remains after masking." + continue; + fi; + #detect if the page is 2-up if [[ $logperphys == 2 ]] || ( [[ $logperphys != 1 ]] && $(dirname $0)/detect2pages.sh ${cleanphyspg} ${scanpgnum} ${physpgnum} ); then @@ -187,18 +219,28 @@ for scanpgnum in `$my_seq 1 $pages`; do cp $cleanphyspg $outdir/logpg-${scanpgnum}-${physpgnum}-1.pnm fi; - + #prepare for ocr + convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/logpg-${scanpgnum}-${physpgnum}-%01d.png || exit 1 + #final convert and clean w/ bebook optimisation - if [[ $bebook ]]; then - convert $convertflags -colorspace Gray -median 1 $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim -fuzz 80% -resize 1200x1600 $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1 + if [[ $bebook ]]; then #1200x1600 + convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1 else convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1 fi - + done; done +#don't try full ocr, it's a waste of time. +#rm -rf $outdir/logpgs +#ocropus book2pages $outdir/logpgs $outdir/logpg-*.png +#ocropus pages2lines $outdir/logpgs +#ocropus lines2fsts $outdir/logpgs +#ocropus fsts2bestpaths $outdir/logpgs +#ocropus buildhtml $outdir/logpgs > $outdir/out.html + mkdir -p $outdir/pages mv $outdir/final-*.${extension} $outdir/pages