X-Git-Url: https://git.ucc.asn.au/?p=dja%2Fscandal.git;a=blobdiff_plain;f=scan2pages.sh;h=069b4211dcbe997f259f69939ddd5162eda67f1f;hp=42b3bfbf8a6d091394e881876c35f2f98767abe1;hb=bdb5f5eb47668416dcd60d03f807b38e963a1501;hpb=aeb663e7da50cdad5b60dcb5d7a1f78fc83f36b4 diff --git a/scan2pages.sh b/scan2pages.sh index 42b3bfb..069b421 100755 --- a/scan2pages.sh +++ b/scan2pages.sh @@ -133,9 +133,9 @@ ocropus book2pages $outdir/scanpgs $outdir/scanpg-*.png || exit 1 for scanpgnum in `$my_seq 1 $pages`; do + binscanpg=$outdir/scanpgs/$(printf '%04d' $scanpgnum).bin.png scanpgnum=$(printf '%03d' $scanpgnum) scanpg=$outdir/scanpg-${scanpgnum}.png - binscanpg=$outdir/scanpgs/$(printf '%04d' $scanpgnum).bin.png # preprocess scanned page cleanscanpg=$outdir/scanpg-clean-${scanpgnum}.pnm @@ -162,7 +162,11 @@ for scanpgnum in `$my_seq 1 $pages`; do #unpaper it physpgbase=$outdir/physpg-${scanpgnum} if [ ! -e ${physpgbase}-1.pnm ] || [ ! -e ${physpgbase}-2.pnm ]; then - unpaper $unpaperflags --layout double --overwrite -ni 10 -op 2 $cleanscanpg ${physpgbase}-%01d.pnm || exit 1 + # most of unpaper's processing is redundant given ocropus, and somewhat too aggressive + unpaper $unpaperflags --layout double --overwrite -op 2 \ + --no-processing \ + $cleanscanpg ${physpgbase}-%01d.pnm || exit 1 + fi; for physpgnum in $($my_seq 1 2); do @@ -197,18 +201,28 @@ for scanpgnum in `$my_seq 1 $pages`; do cp $cleanphyspg $outdir/logpg-${scanpgnum}-${physpgnum}-1.pnm fi; - + #prepare for ocr + convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/logpg-${scanpgnum}-${physpgnum}-%01d.png || exit 1 + #final convert and clean w/ bebook optimisation if [[ $bebook ]]; then - convert $convertflags -colorspace Gray -median 1 $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim -fuzz 80% -resize 1200x1600 $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1 + convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim -fuzz 80% -resize 1200x1600 
$outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1 else convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1 fi - + done; done +#try full ocr +rm -rf $outdir/logpgs +ocropus book2pages $outdir/logpgs $outdir/logpg-*.png +ocropus pages2lines $outdir/logpgs +ocropus lines2fsts $outdir/logpgs +ocropus fsts2bestpaths $outdir/logpgs +ocropus buildhtml $outdir/logpgs > $outdir/out.html + mkdir -p $outdir/pages mv $outdir/final-*.${extension} $outdir/pages