From: Daniel Axtens <dja@ucc.gu.uwa.edu.au>
Date: Fri, 5 Aug 2011 12:45:56 +0000 (+0800)
Subject: Update docs, various fixes.
X-Git-Url: https://git.ucc.asn.au/?p=dja%2Fscandal.git;a=commitdiff_plain;h=88d39ee082b1599229efe259f04579e34a13a395;hp=aeb663e7da50cdad5b60dcb5d7a1f78fc83f36b4

Update docs, various fixes.
---

diff --git a/architecture.txt b/architecture.txt
index 95251d3..5646a24 100644
--- a/architecture.txt
+++ b/architecture.txt
@@ -12,17 +12,19 @@ each physical page may contain either 2 or 1 logical pages
 3. determine dpi
 4. foreach double-page-spread (scan page)
 	4.1. extract scan page from pdf, save as png
-	4.2. run a mask over it to pull off large black areas
-	4.3. run unpaper over it, creating 2 pages (physical page)
-	4.4. foreach physical page
-		4.4.1. remask and retrim
-		4.4.2. attempt to detect if a physical page contains 2 logical pages, 
-			4.4.2.1. if so split with unpaper
-		4.4.3. do any final processing (resize for bebook)
-5. move all the final pictures into a final picture directory
 
-In the accidentally deleted code we used ocropus's binarise stuff to do some
-extra cleaning.
+5. run ocropus's binarise over all the pngs
+
+6. foreach binarised scan page
+	6.1. create a mask from the original (unbinarised) page
+	6.2. use the mask to trim the binarised page (cutting off the large black areas improves unpaper's accuracy)
+	6.3. run unpaper over the clean binarised page, creating 2 pages (physical page)
+	6.4. foreach physical page
+		6.4.1. remask and retrim
+		6.4.2. attempt to detect whether a physical page contains 2 logical pages;
+			6.4.2.1. if so, split it with unpaper
+		6.4.3. do any final processing (resize for bebook)
+7. move all the final pictures into a final picture directory
 
 = What options do we need? =
 Anything we attempt to detect automatically should have the option to set manually
diff --git a/scan2pages.sh b/scan2pages.sh
index 42b3bfb..dbd66ee 100755
--- a/scan2pages.sh
+++ b/scan2pages.sh
@@ -133,9 +133,9 @@ ocropus book2pages $outdir/scanpgs $outdir/scanpg-*.png || exit 1
 
 for scanpgnum in  `$my_seq 1 $pages`; do
 
+	binscanpg=$outdir/scanpgs/$(printf '%04d' $scanpgnum).bin.png
 	scanpgnum=$(printf '%03d' $scanpgnum)
 	scanpg=$outdir/scanpg-${scanpgnum}.png
-	binscanpg=$outdir/scanpgs/$(printf '%04d' $scanpgnum).bin.png
 
 	# preprocess scanned page
 	cleanscanpg=$outdir/scanpg-clean-${scanpgnum}.pnm
@@ -197,18 +197,28 @@ for scanpgnum in  `$my_seq 1 $pages`; do
 			cp $cleanphyspg $outdir/logpg-${scanpgnum}-${physpgnum}-1.pnm
 		fi;
 
-	
+		#prepare for ocr
+		convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/logpg-${scanpgnum}-${physpgnum}-%01d.png || exit 1
+
 		#final convert and clean w/ bebook optimisation
 		if [[ $bebook ]]; then
-			convert $convertflags -colorspace Gray -median 1 $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim -fuzz 80% -resize 1200x1600 $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1
+			convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm -trim -fuzz 80% -resize 1200x1600 $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1
 		else
 			convert $convertflags $outdir/logpg-${scanpgnum}-${physpgnum}-?.pnm $outdir/final-${scanpgnum}-${physpgnum}-%01d.${extension} || exit 1
 		fi
-	
+
 	done;
 
 done
 
+#try full ocr
+rm -rf $outdir/logpgs
+ocropus book2pages $outdir/logpgs $outdir/logpg-*.png
+ocropus pages2lines $outdir/logpgs
+ocropus lines2fsts $outdir/logpgs
+ocropus fsts2bestpaths $outdir/logpgs
+ocropus buildhtml $outdir/logpgs > $outdir/out.html
+
 mkdir -p $outdir/pages
 mv $outdir/final-*.${extension} $outdir/pages