X-Git-Url: https://git.ucc.asn.au/?p=dja%2Fscandal.git;a=blobdiff_plain;f=scan2pages.sh;h=cbfe52f06bd4af008225c65a3426c9e3b0527f49;hp=42b3bfbf8a6d091394e881876c35f2f98767abe1;hb=88f2331a5a94726bdf7c2b28f11c203a3f9ae15a;hpb=aeb663e7da50cdad5b60dcb5d7a1f78fc83f36b4 diff --git a/scan2pages.sh b/scan2pages.sh index 42b3bfb..cbfe52f 100755 --- a/scan2pages.sh +++ b/scan2pages.sh @@ -128,14 +128,23 @@ for scanpgnum in `$my_seq 1 $pages`; do done; # do ocr binarise -[ -e $outdir/scanpgs ] && rm -r $outdir/scanpgs -ocropus book2pages $outdir/scanpgs $outdir/scanpg-*.png || exit 1 +if [ -e $outdir/scanpgs ]; then + #assume (for the sake of speed in repeated runs) that + # if the last file exists, the process completed successfully + if [ -e $outdir/scanpgs/$(printf '%04d' $pages).bin.png ]; then + [[ $verbose ]] && echo Binarisation already complete + else + rm -r $outdir/scanpgs + fi +fi +[ -e $outdir/scanpgs ] || ocropus book2pages $outdir/scanpgs $outdir/scanpg-*.png || exit 1 +# process each binarised scan page for scanpgnum in `$my_seq 1 $pages`; do + binscanpg=$outdir/scanpgs/$(printf '%04d' $scanpgnum).bin.png scanpgnum=$(printf '%03d' $scanpgnum) scanpg=$outdir/scanpg-${scanpgnum}.png - binscanpg=$outdir/scanpgs/$(printf '%04d' $scanpgnum).bin.png # preprocess scanned page cleanscanpg=$outdir/scanpg-clean-${scanpgnum}.pnm @@ -162,7 +171,11 @@ for scanpgnum in `$my_seq 1 $pages`; do #unpaper it physpgbase=$outdir/physpg-${scanpgnum} if [ ! -e ${physpgbase}-1.pnm ] || [ ! -e ${physpgbase}-2.pnm ]; then - unpaper $unpaperflags --layout double --overwrite -ni 10 -op 2 $cleanscanpg ${physpgbase}-%01d.pnm || exit 1 + # most of unpaper's processing is redundant given ocropus, and somewhat too agressive + unpaper $unpaperflags --layout double --overwrite -op 2 \ + --no-processing \ + $cleanscanpg ${physpgbase}-%01d.pnm || exit 1 + fi; for physpgnum in $($my_seq 1 2); do @@ -197,18 +210,28 @@ for scanpgnum in `$my_seq 1 $pages`; do cp $cleanphyspg $outdir/logpg-${scanpgnum}-${physpgnum}-1.pnm fi; - + #prepare for ocr + convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/logpg-${scanpgnum}-${physpgnum}-%01d.png || exit 1 + #final convert and clean w/ bebook optimisation if [[ $bebook ]]; then - convert $convertflags -colorspace Gray -median 1 $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim -fuzz 80% -resize 1200x1600 $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1 + convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim -fuzz 80% -resize 1200x1600 $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1 else convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1 fi - + done; done +#try full ocr +rm -rf $outdir/logpgs +ocropus book2pages $outdir/logpgs $outdir/logpg-*.png +ocropus pages2lines $outdir/logpgs +ocropus lines2fsts $outdir/logpgs +ocropus fsts2bestpaths $outdir/logpgs +ocropus buildhtml $outdir/logpgs > $outdir/out.html + mkdir -p $outdir/pages mv $outdir/final-*.${extension} $outdir/pages