$scanpg || exit 1
fi;
- # preprocess it!
+done;
+
+# do ocr binarise
+# Binarised pages are written to $outdir/scanpgs and are read back in the loop
+# below as scanpgs/NNNN.bin.png, so the input list must be exactly the numbered
+# scans, in order.
+# Drop the output directory of any previous run first (:? aborts instead of
+# touching /scanpgs should $outdir ever be empty).
+[ -e "$outdir/scanpgs" ] && rm -r -- "${outdir:?}/scanpgs"
+# Match only scanpg-NNN.png: a bare scanpg-*.png would also hand the
+# scanpg-mask-NNN.png files from the same directory to the binariser.
+ocropus book2pages "$outdir/scanpgs" "$outdir"/scanpg-[0-9]*.png || exit 1
+
+for scanpgnum in `$my_seq 1 $pages`; do
+
+ binscanpg=$outdir/scanpgs/$(printf '%04d' $scanpgnum).bin.png
+ scanpgnum=$(printf '%03d' $scanpgnum)
+ scanpg=$outdir/scanpg-${scanpgnum}.png
+
+ # preprocess scanned page
cleanscanpg=$outdir/scanpg-clean-${scanpgnum}.pnm
if [ ! -e $cleanscanpg ]; then
# create mask:
- cropcords=$(convert -border 1x1 -bordercolor '#000' -resize 1000% -trim -fuzz 90% -format "%wx%h%O" $outdir/scanpg-mask-${scanpgnum}.png info: || exit 1)
+ # Ask ImageMagick for the trimmed mask's size and offset; the result is used
+ # as the -crop argument for the page itself.
+ # The exit has to sit outside $(...): inside, it only ends the subshell and
+ # the script would carry on with an empty $cropcords.
+ cropcords=$(convert -border 1x1 -bordercolor '#000' -resize 1000% -trim -fuzz 90% -format "%wx%h%O" "$outdir/scanpg-mask-${scanpgnum}.png" info:) || exit 1
# ... crop and despeckle? the final pre-prepared image
- convert $convertflags -crop $cropcords $scanpg $cleanscanpg || exit 1
- elif [[ $skipmask ]]; then
- cp $origfile $preppnm
+ convert $convertflags -crop $cropcords $binscanpg $cleanscanpg || exit 1
fi;
# check it hasn't mostly disappeared - e.g. if the scan was all black
echo "Resplitting physical page ${physpgnum}."
fi
unpaper $unpaperflags --pre-rotate 90 --layout double --overwrite \
- -op 2 --no-blackfilter --no-greyfilter --no-noisefilter \
- -no-blurfilter $cleanphyspg $outdir/logpg-${scanpgnum}-${physpgnum}-%01d.pnm || exit 1
+ -op 2 --no-blackfilter --no-grayfilter --no-noisefilter \
+ --no-blurfilter $cleanphyspg $outdir/logpg-${scanpgnum}-${physpgnum}-%01d.pnm || exit 1
else
cp $cleanphyspg $outdir/logpg-${scanpgnum}-${physpgnum}-1.pnm
fi;
-
+ #prepare for ocr
+ convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/logpg-${scanpgnum}-${physpgnum}-%01d.png || exit 1
+
#final convert and clean w/ bebook optimisation
if [[ $bebook ]]; then
- convert $convertflags -colorspace Gray -median 1 $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim -fuzz 80% -resize 1200x1600 $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1
+ convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim -fuzz 80% -resize 1200x1600 $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1
else
convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1
fi
-
+
done;
done
+#try full ocr
+# Best effort: a failing OCR stage is reported on stderr but does not abort
+# the script, so the finished pages are still moved into place afterwards.
+# Remove the output of any previous run (:? aborts rather than deleting
+# /logpgs should $outdir ever be empty).
+rm -rf -- "${outdir:?}/logpgs"
+# Each stage works on what the previous one left in $outdir/logpgs, so the
+# chain stops at the first failure instead of processing partial results.
+ocropus book2pages "$outdir/logpgs" "$outdir"/logpg-*.png &&
+ ocropus pages2lines "$outdir/logpgs" &&
+ ocropus lines2fsts "$outdir/logpgs" &&
+ ocropus fsts2bestpaths "$outdir/logpgs" &&
+ ocropus buildhtml "$outdir/logpgs" > "$outdir/out.html" ||
+ echo "ocr failed; $outdir/out.html may be missing or incomplete" >&2
+
mkdir -p $outdir/pages
mv $outdir/final-*.${extension} $outdir/pages