3. determine dpi
4. foreach double-page-spread (scan page)
4.1. extract scan page from pdf, save as png
- 4.2. run a mask over it to pull off large black areas
- 4.3. run unpaper over it, creating 2 pages (physical page)
- 4.4. foreach physical page
- 4.4.1. remask and retrim
- 4.4.2. attempt to detect if a physical page contains 2 logical pages,
- 4.4.2.1. if so split with unpaper
- 4.4.3. do any final processing (resize for bebook)
-5. move all the final pictures into a final picture directory
-In the accidentally deleted code we used ocropus's binarise stuff to do some
-extra cleaning.
+5. run ocropus's binarise over all the pngs
+
+6. foreach binarised scan page
+ 6.1. create a mask from the original (unbinarised) page
+ 6.2. use the mask to trim the binarised page (trimming off the large black areas improves unpaper's accuracy)
+ 6.3. run unpaper over the clean binarised page, creating 2 pages (physical page)
+ 6.4. foreach physical page
+ 6.4.1. remask and retrim
+ 6.4.2. attempt to detect if a physical page contains 2 logical pages,
+ 6.4.2.1. if so split with unpaper
+ 6.4.3. do any final processing (resize for bebook)
+7. move all the final pictures into a final picture directory
= What options do we need? =
Anything we attempt to detect automatically should have the option to set manually
for scanpgnum in `$my_seq 1 $pages`; do
+ binscanpg=$outdir/scanpgs/$(printf '%04d' $scanpgnum).bin.png
scanpgnum=$(printf '%03d' $scanpgnum)
scanpg=$outdir/scanpg-${scanpgnum}.png
- binscanpg=$outdir/scanpgs/$(printf '%04d' $scanpgnum).bin.png
# preprocess scanned page
cleanscanpg=$outdir/scanpg-clean-${scanpgnum}.pnm
cp $cleanphyspg $outdir/logpg-${scanpgnum}-${physpgnum}-1.pnm
fi;
-
+ #prepare for ocr: convert the logical page(s) from pnm to png
+ convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/logpg-${scanpgnum}-${physpgnum}-%01d.png || exit 1
+
#final convert and clean w/ bebook optimisation
if [[ $bebook ]]; then
- convert $convertflags -colorspace Gray -median 1 $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim -fuzz 80% -resize 1200x1600 $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1
+ convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim -fuzz 80% -resize 1200x1600 $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1
else
convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1
fi
-
+
done;
done
+#try full ocr: run the ocropus pipeline over the logical page pngs and write the result to out.html
+rm -rf $outdir/logpgs
+ocropus book2pages $outdir/logpgs $outdir/logpg-*.png
+ocropus pages2lines $outdir/logpgs
+ocropus lines2fsts $outdir/logpgs
+ocropus fsts2bestpaths $outdir/logpgs
+ocropus buildhtml $outdir/logpgs > $outdir/out.html
+
mkdir -p $outdir/pages
mv $outdir/final-*.${extension} $outdir/pages