[ocrfeeder] Handle special chars in filenames when importing PDF files
- From: Joaquim Rocha <jrocha src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [ocrfeeder] Handle special chars in filenames when importing PDF files
- Date: Sun, 8 Mar 2020 20:55:06 +0000 (UTC)
commit 5286120c8bc8b7ba74e0f9b19b5262b509f38cee
Author: scx <scx mail gmail com>
Date: Sun Mar 8 21:36:57 2020 +0100
Handle special chars in filenames when importing PDF files
Some special characters (e.g. quotes) in the filename cause gs to fail.
What's worse, gs treats the escape character as a literal character, so escaping the filename does not help.
This means that gs cannot handle every valid Unix filename on its own.
We need to create a temp symlink as a workaround for gs limitations.
Fixes GNOME/ocrfeeder#20
src/ocrfeeder/util/lib.py | 67 ++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 57 insertions(+), 10 deletions(-)
---
diff --git a/src/ocrfeeder/util/lib.py b/src/ocrfeeder/util/lib.py
index bf3c0aa..71a2965 100644
--- a/src/ocrfeeder/util/lib.py
+++ b/src/ocrfeeder/util/lib.py
@@ -28,6 +28,7 @@ from .constants import *
import sane
import tempfile
import locale
+import re
import xml.etree.ElementTree as etree
from .log import debug
@@ -43,20 +44,66 @@ def getIconOrLabel(icon_name, label_text, icon_size = Gtk.IconSize.SMALL_TOOLBAR
label = None
return icon, label
+def getSafeGhostscriptPath(file_path):
+ return re.sub(r'[^\w !#$%&()*+,./:;<=>?@\[\\\]^_`{|}~-]', '_', file_path)
+
+def getSafeGhostscriptInputFilename(file_name):
+ return re.sub(r'[/]', '_', getSafeGhostscriptPath(file_name))
+
+def getSafeGhostscriptOutputBasename(file_name):
+ return re.sub(r'[%]', '_', getSafeGhostscriptInputFilename(file_name))
+
def convertPdfToImages(pdf_file, temp_dir = '/tmp'):
- dir_name = tempfile.mkdtemp(dir = temp_dir)
+ if not os.path.isfile(pdf_file):
+ debug('Unable to convert PDF: File does not exist: %s', pdf_file)
+ return None
+ try:
+ dir_name = tempfile.mkdtemp(dir = temp_dir)
+ except:
+ debug('Unable to convert PDF: Cannot create temp dir in: %s', temp_dir)
+ return None
+
debug('Converting PDF: %s to image', pdf_file)
+
+ file_name = os.path.basename(pdf_file)
+ base_name = os.path.splitext(file_name)[0]
+ pdf_path = pdf_file
+ file_name_safe = getSafeGhostscriptInputFilename(file_name)
+ base_name_safe = getSafeGhostscriptOutputBasename(base_name)
+ pdf_file_safe = getSafeGhostscriptPath(pdf_file)
+
+ if pdf_file != pdf_file_safe:
+ try:
+ pdf_path_safe = os.path.join(dir_name, file_name_safe)
+ os.symlink(pdf_file, pdf_path_safe)
+ except:
+ debug('Unable to convert PDF: Cannot create temp symlink in: %s', dir_name)
+ return None
+
+ runGhostscript(dir_name, base_name_safe, pdf_path_safe)
+ try:
+ os.unlink(pdf_path_safe)
+ except:
+ debug('PDF conversion warning: Cannot remove temp symlink: %s', pdf_path_safe)
+ else:
+ runGhostscript(dir_name, base_name_safe, pdf_path)
+
+ return dir_name
+
+def runGhostscript(dir_name, base_name, pdf_path):
+ format='jpeg'
resolution = 300
- file_name = os.path.splitext(os.path.basename(pdf_file))[0]
- command = 'gs -SDEVICE=jpeg -r%(resolution)sx%(resolution)s -sPAPERSIZE=letter ' \
- '-sOutputFile="%(temp_name)s/%(file_name)s_%%04d.jpg" ' \
- '-dNOPAUSE -dBATCH -- "%(pdf_file)s"' % \
- {'temp_name': dir_name,
- 'file_name': file_name,
- 'pdf_file': pdf_file,
- 'resolution': resolution}
+ size = 'letter'
+ command = 'gs -SDEVICE=%(format)s -r%(resolution)sx%(resolution)s -sPAPERSIZE=%(size)s ' \
+ '-sOutputFile=\'%(temp_name)s/%(file_name)s_%%04d.jpg\' ' \
+ '-dNOPAUSE -dBATCH -- \'%(pdf_file)s\'' % \
+ {'format': format,
+ 'temp_name': dir_name,
+ 'file_name': base_name,
+ 'pdf_file': pdf_path,
+ 'resolution': resolution,
+ 'size': size}
process = subprocess.run(command, shell=True)
- return dir_name
def getImagesFromFolder(folder):
if not os.path.isdir(folder):
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]