[ocrfeeder] Clean recognized text before returning it
- From: Joaquim Manuel Pereira Rocha <jrocha src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [ocrfeeder] Clean recognized text before returning it
- Date: Tue, 27 Jul 2010 22:08:02 +0000 (UTC)
commit 57be6850d909e0cfcc8a8d19e8eea1dae772c9fd
Author: Joaquim Rocha <jrocha igalia com>
Date: Thu Jul 8 17:22:07 2010 +0200
Clean recognized text before returning it
layoutAnalysis.LayoutAnalysis: Remove line breaks in an attempt to
make the text closer to the original in the image, since OCR
engines output the text line-by-line with a newline char after every
line.
feeder/layoutAnalysis.py | 10 +++++++++-
1 files changed, 9 insertions(+), 1 deletions(-)
---
diff --git a/feeder/layoutAnalysis.py b/feeder/layoutAnalysis.py
index 63b827e..6b8d2ce 100644
--- a/feeder/layoutAnalysis.py
+++ b/feeder/layoutAnalysis.py
@@ -23,6 +23,7 @@ from util import graphics
from util.constants import OCRFEEDER_DEBUG, DTP
from studio.dataHolder import DataBox
from imageManipulation import ImageProcessor
+import re
NONE = 0
TOP = -1
@@ -456,4 +457,11 @@ class LayoutAnalysis(object):
def readImage(self, image):
self.ocr_engine.setImage(image)
- return self.ocr_engine.read()
+ text = self.ocr_engine.read()
+ text = self.__cleanText(text)
+ return text
+
+ def __cleanText(self, text):
+ clean_text = re.sub(r'(?<!-)-\n(?!\n)', r'', text)
+ clean_text = re.sub(r'(?<!\n)\n', r' ', clean_text)
+ return clean_text
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]