[tracker/wip/sam/resource-jsonld: 6/6] WORK IN PROGRESS converting 400 extractor tests
- From: Sam Thursfield <sthursfield src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/wip/sam/resource-jsonld: 6/6] WORK IN PROGRESS converting 400 extractor tests
- Date: Thu, 7 Jul 2016 00:00:08 +0000 (UTC)
commit 661f2e377309b523f7406ae86d8f21b378f886a7
Author: Sam Thursfield <sam afuera me uk>
Date: Thu Jul 7 00:59:15 2016 +0100
WORK IN PROGRESS converting 400 extractor tests
Ignore this commit (or finish it off for me ... :-)
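For context: the old harness regex-parsed the SPARQL text printed by `tracker-extract --file`, while this branch asks `tracker extract` for JSON-LD and loads it directly. A minimal sketch of the new flow, assuming the output shape implied by the converted .expected files below (real tool output may differ):

    import json
    import subprocess

    # Flags mirror the change in common/utils/extractor.py below.
    output = subprocess.check_output(
        ['tracker', 'extract', '--verbosity=errors',
         '--output-format=json-ld', 'pdf-doc.pdf'])
    data = json.loads(output)

    # Assumed shape, based on the converted .expected files:
    #   {"@type": "nfo:PaginatedTextDocument", "nfo:pageCount": "22"}
    assert data['@type'] == 'nfo:PaginatedTextDocument'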
tests/functional-tests/400-extractor.py | 7 +-
tests/functional-tests/common/utils/extractor.py | 251 ++------------------
.../test-extraction-data/office/pdf-doc.expected | 18 +-
.../playlists/playlist-test-1.expected | 29 ++-
4 files changed, 47 insertions(+), 258 deletions(-)
---
diff --git a/tests/functional-tests/400-extractor.py b/tests/functional-tests/400-extractor.py
index 140b5d5..4fd3b8b 100755
--- a/tests/functional-tests/400-extractor.py
+++ b/tests/functional-tests/400-extractor.py
@@ -1,6 +1,7 @@
#!/usr/bin/python
#
# Copyright (C) 2010, Nokia <ivan frade nokia com>
+# Copyright (C) 2016, Sam Thursfield <sam afuera me uk>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
@@ -24,7 +25,7 @@ directory (containing xxx.expected files)
"""
from common.utils import configuration as cfg
-from common.utils.extractor import get_tracker_extract_output
+from common.utils.extractor import get_tracker_extract_jsonld_output
import unittest2 as ut
import os
import sys
@@ -112,7 +113,7 @@ class ExtractionTestCase (ut.TestCase):
filename_to_extract = self.configParser.get ("TestFile", "Filename")
self.file_to_extract = os.path.join (desc_root, filename_to_extract)
- result = get_tracker_extract_output(self.file_to_extract)
+ result = get_tracker_extract_jsonld_output(self.file_to_extract)
self.__assert_extraction_ok (result)
def assertDictHasKey (self, d, key, msg=None):
@@ -156,6 +157,8 @@ class ExtractionTestCase (ut.TestCase):
unexpected_pairs.append ( (k[1:].replace ("_", ":"), v) )
elif k.startswith ("@"):
expected_keys.append ( k[1:].replace ("_", ":") )
+ elif k == 'a':
+ expected_keys.append ( '@type' )
else:
expected_pairs.append ( (k.replace ("_", ":"), v) )
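As a standalone sketch, the key translation this hunk performs on the INI-style expected files (the helper name is mine, not from the patch):

    def translate_expected_key(k, v):
        """Map one expected-file key to the bucket the test checks.

        '!' prefix -> pair that must NOT appear in the output
        '@' prefix -> key that must merely exist
        'a'        -> shorthand for the JSON-LD '@type' key
        otherwise  -> exact key/value pair, with '_' restored to ':'
        """
        if k.startswith('!'):
            return ('unexpected_pairs', k[1:].replace('_', ':'), v)
        elif k.startswith('@'):
            return ('expected_keys', k[1:].replace('_', ':'), None)
        elif k == 'a':
            return ('expected_keys', '@type', None)
        return ('expected_pairs', k.replace('_', ':'), v)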
diff --git a/tests/functional-tests/common/utils/extractor.py b/tests/functional-tests/common/utils/extractor.py
index 8dd0560..7ca5470 100644
--- a/tests/functional-tests/common/utils/extractor.py
+++ b/tests/functional-tests/common/utils/extractor.py
@@ -1,6 +1,7 @@
#!/usr/bin/python
#
# Copyright (C) 2010, Nokia <ivan frade nokia com>
+# Copyright (C) 2016, Sam Thursfield <sam afuera me uk>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
@@ -20,246 +21,19 @@
from common.utils import configuration as cfg
from common.utils.helpers import log
+
+import json
import os
-import re
import subprocess
-class ExtractorParser(object):
- def parse_tracker_extract_output(self, text):
- """
- Parse stdout of `tracker-extract --file` to get SPARQL statements.
-
- Calls the extractor and returns a dictionary of property: value pairs.
-
- Example:
- { 'nie:filename': 'a.jpeg' ,
- 'tracker:added': '2008-12-12T12:23:34Z'
- }
- """
-
- metadata = {}
- parts = self.get_statements_from_stdout_output(text)
- extras = self.__process_where_part(parts['where'])
- for attribute_value in self.__process_lines(parts['item']):
- att, value = attribute_value.split(" ", 1)
- if value.startswith("?") and extras.has_key(value):
- value = extras[value]
-
- if metadata.has_key(att):
- metadata [att].append(value)
- else:
- metadata [att] = [value]
-
- return metadata
-
- def get_statements_from_stdout_output(self, text):
- lines = text.split('\n')
- parts = {}
-
- current_part = None
- part_start = None
-
- i = 0
- for i in range(0, len(lines)):
- if lines[i] == 'SPARQL pre-update:':
- current_part = 'preupdate'
- elif lines[i] == 'SPARQL item:':
- current_part = 'item'
- elif lines[i] == 'SPARQL where clause:':
- current_part = 'where'
- elif lines[i] == 'SPARQL post-update:':
- current_part = 'postupdate'
-
- if lines[i] == '--':
- if part_start is None:
- part_start = i + 1
- else:
- part_lines = lines[part_start:i]
- parts[current_part] = '\n'.join(part_lines)
- current_part = None
- part_start = None
-
- if current_part is not None:
- raise Exception("End of text while parsing %s in tracker-extract "
- "output" % current_part)
-
- if len(parts) == 0:
- raise Exception("No metadata was found by tracker-extract")
-
- return parts
-
- def __process_lines(self, embedded):
- """
- Translate each line in a "prop value" string, handling anonymous nodes.
-
- Example:
- nfo:width 699 ; -> 'nfo:width 699'
- or
- nao:hasTag [ a nao:Tag ;
- nao:prefLabel "tracker"] ; -> nao:hasTag:prefLabel 'tracker'
-
- Would be so cool to implement this with yield and generators... :)
- """
- grouped_lines = []
- current_line = ""
- anon_node_open = False
- for l in embedded.split ("\n\t"):
- if "[" in l:
- current_line = current_line + l
- anon_node_open = True
- continue
-
- if "]" in l:
- anon_node_open = False
- current_line += l
- final_lines = self.__handle_anon_nodes (current_line.strip ())
- grouped_lines = grouped_lines + final_lines
- current_line = ""
- continue
-
- if anon_node_open:
- current_line += l
- else:
- if (len (l.strip ()) == 0):
- continue
-
- final_lines = self.__handle_multivalues (l.strip ())
- grouped_lines = grouped_lines + final_lines
-
- return map (self.__clean_value, grouped_lines)
-
- def __process_where_part(self, where):
- gettags = re.compile ("(\?\w+)\ a\ nao:Tag\ ;\ nao:prefLabel\ \"([\w\ -]+)\"")
- tags = {}
- for l in where.split ("\n"):
- if len (l) == 0:
- continue
- match = gettags.search (l)
- if (match):
- tags [match.group(1)] = match.group (2)
- else:
- print "This line is not a tag:", l
-
- return tags
-
- def __handle_multivalues(self, line):
- """
- Split multivalues like:
- a nfo:Image, nmm:Photo ;
- -> a nfo:Image ;
- -> a nmm:Photo ;
- """
- hasEscapedComma = re.compile ("\".+,.+\"")
-
- if "," in line and not hasEscapedComma.search (line):
- prop, multival = line.split (" ", 1)
- results = []
- for value in multival.split (","):
- results.append ("%s %s" % (prop, value.strip ()))
- return results
- else:
- return [line]
-
- def __handle_anon_nodes(self, line):
- """
- Translates anonymous nodes into 'flat' properties:
-
- nao:hasTag [a nao:Tag; nao:prefLabel "xxx"]
- -> nao:hasTag:prefLabel "xxx"
-
- slo:location [a slo:GeoLocation; slo:postalAddress <urn:uuid:1231-123> .]
- -> slo:location <urn:uuid:1231-123>
-
- nfo:hasMediaFileListEntry [ a nfo:MediaFileListEntry ; nfo:entryUrl "file://x.mp3"; nfo:listPosition 1]
- -> nfo:hasMediaFileListEntry:entryUrl "file://x.mp3"
-
- """
-
- # hasTag case
- if line.startswith ("nao:hasTag"):
- getlabel = re.compile ("nao:prefLabel\ \"([\w\ -]+)\"")
- match = getlabel.search (line)
- if (match):
- line = 'nao:hasTag:prefLabel "%s" ;' % (match.group(1))
- return [line]
- else:
- print "Whats wrong on line", line, "?"
- return [line]
-
- # location case
- elif line.startswith ("slo:location"):
- results = []
-
- # Can have country AND/OR city
- getpa = re.compile ("slo:postalAddress\ \<([\w:-]+)\>")
- pa_match = getpa.search (line)
-
- if (pa_match):
- results.append ('slo:location:postalAddress "%s" ;' % (pa_match.group(1)))
- else:
- print "FIXME another location subproperty in ", line
-
- return results
- elif line.startswith ("nco:creator"):
- getcreator = re.compile ("nco:fullname\ \"([\w\ ]+)\"")
- creator_match = getcreator.search (line)
-
- if (creator_match):
- new_line = 'nco:creator:fullname "%s" ;' % (creator_match.group (1))
- return [new_line]
- else:
- print "Something special in this line '%s'" % (line)
-
- elif line.startswith ("nfo:hasMediaFileListEntry"):
- return self.__handle_playlist_entries (line)
-
- else:
- return [line]
-
- def __handle_playlist_entries(self, line):
- """
- Playlist entries come in one big line:
- nfo:hMFLE [ a nfo:MFLE; nfo:entryUrl '...'; nfo:listPosition X] , [ ... ], [ ... ]
- -> nfo:hMFLE:entryUrl '...'
- -> nfo:hMFLE:entryUrl '...'
- ...
- """
- geturl = re.compile ("nfo:entryUrl \"([\w\.\:\/]+)\"")
- entries = line.strip () [len ("nfo:hasMediaFileListEntry"):]
- results = []
- for entry in entries.split (","):
- url_match = geturl.search (entry)
- if (url_match):
- new_line = 'nfo:hasMediaFileListEntry:entryUrl "%s" ;' % (url_match.group (1))
- results.append (new_line)
- else:
- print " *** Something special in this line '%s'" % (entry)
- return results
-
- def __clean_value(self, value):
- """
- the value comes with a ';' or a '.' at the end
- """
- if (len (value) < 2):
- return value.strip ()
-
- clean = value.strip ()
- if value[-1] in [';', '.']:
- clean = value [:-1]
-
- clean = clean.replace ("\"", "")
-
- return clean.strip ()
-
-
-def get_tracker_extract_output(filename, mime_type=None):
+def get_tracker_extract_jsonld_output(filename, mime_type=None):
"""
Runs `tracker-extract --file` to extract metadata from a file.
"""
- tracker_extract = os.path.join (cfg.EXEC_PREFIX, 'tracker-extract')
- command = [tracker_extract, '--file', filename]
+ tracker = os.path.join (cfg.BINDIR, 'tracker')
+ command = [tracker, 'extract', '--verbosity=errors', '--output-format=json-ld', filename]
if mime_type is not None:
command.extend(['--mime', mime_type])
@@ -267,8 +41,13 @@ def get_tracker_extract_output(filename, mime_type=None):
log ('Running: %s' % ' '.join(command))
output = subprocess.check_output (command)
except subprocess.CalledProcessError as e:
- raise Exception("Error %i from tracker-extract, output: %s" %
- (e.returncode, e.output))
+ raise RuntimeError("Error %i from tracker-extract, see stderr for "
+ "details" % e.returncode)
+
+ try:
+ data = json.loads (output)
+ except ValueError as e:
+ raise RuntimeError("Invalid JSON returned by tracker-extract: "
+ "%s.\nOutput was: %s" % (e, output))
- parser = ExtractorParser()
- return parser.parse_tracker_extract_output(output)
+ return data
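With the parser gone, callers get a plain dict straight from json.loads(); a hedged usage sketch (the file path is illustrative):

    from common.utils.extractor import get_tracker_extract_jsonld_output

    data = get_tracker_extract_jsonld_output('/tmp/pdf-doc.pdf')
    # Keys are JSON-LD terms: '@type' plus prefixed properties
    # such as 'nfo:pageCount'.
    print 'type: %s, pages: %s' % (data.get('@type'), data.get('nfo:pageCount'))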
diff --git a/tests/functional-tests/test-extraction-data/office/pdf-doc.expected b/tests/functional-tests/test-extraction-data/office/pdf-doc.expected
index 93de5b7..6f16a17 100644
--- a/tests/functional-tests/test-extraction-data/office/pdf-doc.expected
+++ b/tests/functional-tests/test-extraction-data/office/pdf-doc.expected
@@ -1,9 +1,11 @@
-[TestFile]
-Filename=pdf-doc.pdf
-Comment=PDF document from the office tools
+{
+ "test": {
+ "Filename": "pdf-doc.pdf",
+ "Comment": "PDF document from the office tools"
+ },
-[Metadata]
-
-[Meego]
-a=nfo:PaginatedTextDocument
-nfo_pageCount=22
+ "metadata": {
+ "@type": "nfo:PaginatedTextDocument",
+ "nfo:pageCount": "22"
+ }
+}
diff --git a/tests/functional-tests/test-extraction-data/playlists/playlist-test-1.expected b/tests/functional-tests/test-extraction-data/playlists/playlist-test-1.expected
index fd822f8..6a26051 100644
--- a/tests/functional-tests/test-extraction-data/playlists/playlist-test-1.expected
+++ b/tests/functional-tests/test-extraction-data/playlists/playlist-test-1.expected
@@ -1,13 +1,18 @@
-[TestFile]
-Filename=playlist-test-1.m3u
-Bugzilla=
-Comment=Regular m3u playlist file
+{
+ "test": {
+ "Filename": "playlist-test-1.m3u",
+ "Comment": "Regular m3u playlist file"
+ },
-[Metadata]
-a=nmm:Playlist
-nfo_entryCounter=5
-nfo_hasMediaFileListEntry_entryUrl=http://www.apnaradio.com/live/ApnaRadio.mp3
-nfo_hasMediaFileListEntry_entryUrl=http://live.apnaradio.com:6464
-nfo_hasMediaFileListEntry_entryUrl=http://live.apnaradio.com:2424
-nfo_hasMediaFileListEntry_entryUrl=http://www.apnaradio.com/live/MaintenanceE.mp3
-nfo_hasMediaFileListEntry_entryUrl=http://www.apnaradio.com/live/MaintenanceP.mp3
+ "metadata": {
+ "@type": "nmm:Playlist",
+ "nfo:entryCounter": "5",
+ "nfo:hasMediaFileListEntry": [
+ { "nfo:entryUrl": "http://www.apnaradio.com/live/ApnaRadio.mp3" },
+ { "nfo:entryUrl": "http://live.apnaradio.com:6464" },
+ { "nfo:entryUrl": "http://live.apnaradio.com:2424" },
+ { "nfo:entryUrl": "http://www.apnaradio.com/live/MaintenanceE.mp3" },
+ { "nfo:entryUrl": "http://www.apnaradio.com/live/MaintenanceP.mp3" }
+ ]
+ }
+}
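The nested "nfo:hasMediaFileListEntry" list replaces the old repeated nfo_hasMediaFileListEntry_entryUrl keys; a sketch of how a test might compare it against the extractor's JSON-LD, assuming entry order does not matter (the helper is hypothetical, not part of this patch):

    import json

    def check_playlist_entries(extracted, expected_path):
        with open(expected_path) as f:
            expected = json.load(f)
        want = set(e['nfo:entryUrl']
                   for e in expected['metadata']['nfo:hasMediaFileListEntry'])
        got = set(e['nfo:entryUrl']
                  for e in extracted.get('nfo:hasMediaFileListEntry', []))
        assert got == want, 'entry URL mismatch: %s' % (want ^ got)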