X-Git-Url: https://git.ucc.asn.au/?p=dja%2Fscandal.git;a=blobdiff_plain;f=scan2pages.sh;h=dbd66eea3ce681a831ce08aaf154e59b384a4b72;hp=be898707f5f8868d3effd73adaaf2b65e7075edd;hb=88d39ee082b1599229efe259f04579e34a13a395;hpb=8f395874140a4c6992c80fb904096e11231b4166;ds=sidebyside diff --git a/scan2pages.sh b/scan2pages.sh index be89870..dbd66ee 100755 --- a/scan2pages.sh +++ b/scan2pages.sh @@ -125,7 +125,19 @@ for scanpgnum in `$my_seq 1 $pages`; do $scanpg || exit 1 fi; - # preprocess it! +done; + +# do ocr binarise +[ -e $outdir/scanpgs ] && rm -r $outdir/scanpgs +ocropus book2pages $outdir/scanpgs $outdir/scanpg-*.png || exit 1 + +for scanpgnum in `$my_seq 1 $pages`; do + + binscanpg=$outdir/scanpgs/$(printf '%04d' $scanpgnum).bin.png + scanpgnum=$(printf '%03d' $scanpgnum) + scanpg=$outdir/scanpg-${scanpgnum}.png + + # preprocess scanned page cleanscanpg=$outdir/scanpg-clean-${scanpgnum}.pnm if [ ! -e $cleanscanpg ]; then # create mask: @@ -137,9 +149,7 @@ for scanpgnum in `$my_seq 1 $pages`; do cropcords=$(convert -border 1x1 -bordercolor '#000' -resize 1000% -trim -fuzz 90% -format "%wx%h%O" $outdir/scanpg-mask-${scanpgnum}.png info: || exit 1) # ... crop and despeckle? the final pre-prepared image - convert $convertflags -crop $cropcords $scanpg $cleanscanpg || exit 1 - elif [[ $skipmask ]]; then - cp $origfile $preppnm + convert $convertflags -crop $cropcords $binscanpg $cleanscanpg || exit 1 fi; # check it hasn't mostly disappeared - e.g. if the scan was all black @@ -177,29 +187,38 @@ for scanpgnum in `$my_seq 1 $pages`; do #detect if the page is 2-up if [[ $logperphys == 2 ]] || ( [[ $logperphys != 1 ]] && $(dirname $0)/detect2pages.sh ${cleanphyspg} ${scanpgnum} ${physpgnum} ); then - if [ ! -e ${physpgbase}-1.pnm ] || [ ! -e ${physpgbase}-2.pnm ]; then if [[ $verbose == 1 ]]; then echo "Resplitting physical page ${physpgnum}." fi unpaper $unpaperflags --pre-rotate 90 --layout double --overwrite \ - -op 2 --no-blackfilter --no-greyfilter --no-noisefilter \ - -no-blurfilter $cleanphyspg $outdir/logpg-${scanpgnum}-${physpgnum}-%01d.pnm || exit 1 + -op 2 --no-blackfilter --no-grayfilter --no-noisefilter \ + --no-blurfilter $cleanphyspg $outdir/logpg-${scanpgnum}-${physpgnum}-%01d.pnm || exit 1 else cp $cleanphyspg $outdir/logpg-${scanpgnum}-${physpgnum}-1.pnm fi; - + #prepare for ocr + convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/logpg-${scanpgnum}-${physpgnum}-%01d.png || exit 1 + #final convert and clean w/ bebook optimisation if [[ $bebook ]]; then - convert $convertflags -colorspace Gray -median 1 $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim -fuzz 80% -resize 1200x1600 $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1 + convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim -fuzz 80% -resize 1200x1600 $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1 else convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1 fi - + done; done +#try full ocr +rm -rf $outdir/logpgs +ocropus book2pages $outdir/logpgs $outdir/logpg-*.png +ocropus pages2lines $outdir/logpgs +ocropus lines2fsts $outdir/logpgs +ocropus fsts2bestpaths $outdir/logpgs +ocropus buildhtml $outdir/logpgs > $outdir/out.html + mkdir -p $outdir/pages mv $outdir/final-*.${extension} $outdir/pages