[gnome-documents] miners: avoid DB updates if mtime didn't change



commit 452acf9d4cff55c026b5ecaa6a9ff0987471cd36
Author: Cosimo Cecchi <cosimoc gnome org>
Date:   Mon Oct 1 19:59:09 2012 -0400

    miners: avoid DB updates if mtime didn't change
    
    When we mine remote entries whose modification time didn't change,
    skip re-querying all the other properties and re-setting them in
    Tracker, since that is an expensive operation.
    This makes the mining operation much faster.

 src/Makefile-miner.am        |    1 +
 src/miner/gd-gdata-miner.c   |   36 +++++++-----
 src/miner/gd-miner-tracker.c |  134 ++++++++++++++++++++++++++++++++++++++++++
 src/miner/gd-miner-tracker.h |   16 +++++
 src/miner/gd-zpj-miner.c     |   39 +++++++-----
 5 files changed, 195 insertions(+), 31 deletions(-)
---
diff --git a/src/Makefile-miner.am b/src/Makefile-miner.am
index f0f4615..31a94c9 100644
--- a/src/Makefile-miner.am
+++ b/src/Makefile-miner.am
@@ -11,6 +11,7 @@ gdminer_source_c = \
 libgdminer_1_0_la_CPPFLAGS = \
     -DG_LOG_DOMAIN=\"Gdminer\" \
     -DG_DISABLE_DEPRECATED \
+    -I$(top_srcdir)/src/lib \
     $(DOCUMENTS_CFLAGS) \
     $(NULL)
 
diff --git a/src/miner/gd-gdata-miner.c b/src/miner/gd-gdata-miner.c
index d747ecb..e145fdf 100644
--- a/src/miner/gd-gdata-miner.c
+++ b/src/miner/gd-gdata-miner.c
@@ -39,6 +39,8 @@ account_miner_job_process_entry (GdAccountMinerJob *job,
   gchar *resource = NULL;
   gchar *date, *resource_url, *identifier;
   const gchar *class = NULL;
+  gboolean mtime_changed, resource_exists;
+  gint64 new_mtime;
 
   GList *authors, *l, *parents = NULL;
   GDataAuthor *author;
@@ -86,20 +88,35 @@ account_miner_job_process_entry (GdAccountMinerJob *job,
   resource = gd_miner_tracker_sparql_connection_ensure_resource
     (job->connection,
      job->cancellable, error,
+     &resource_exists,
      resource_url, identifier,
      "nfo:RemoteDataObject", class, NULL);
 
   if (*error != NULL)
     goto out;
 
-  gd_miner_tracker_sparql_connection_set_triple
-    (job->connection, job->cancellable, error,
-     identifier, resource,
-     "nie:dataSource", job->datasource_urn);
+  gd_miner_tracker_update_datasource (job->connection, job->datasource_urn,
+                                      resource_exists, identifier, resource,
+                                      job->cancellable, error);
+
+  if (*error != NULL)
+    goto out;
+
+  new_mtime = gdata_entry_get_updated (entry);
+  mtime_changed = gd_miner_tracker_update_mtime (job->connection, new_mtime,
+                                                 resource_exists, identifier, resource,
+                                                 job->cancellable, error);
 
   if (*error != NULL)
     goto out;
 
+  /* avoid updating the DB if the entry already exists and has not
+   * been modified since our last run.
+   */
+  if (!mtime_changed)
+    goto out;
+
+  /* the resource changed - just set all the properties again */
   alternate = gdata_entry_look_up_link (entry, GDATA_LINK_ALTERNATE);
   alternate_uri = gdata_link_get_uri (alternate);
 
@@ -258,17 +275,6 @@ account_miner_job_process_entry (GdAccountMinerJob *job,
   if (*error != NULL)
     goto out;
 
-  date = gd_iso8601_from_timestamp (gdata_entry_get_updated (entry));
-  gd_miner_tracker_sparql_connection_insert_or_replace_triple
-    (job->connection,
-     job->cancellable, error,
-     identifier, resource,
-     "nie:contentLastModified", date);
-  g_free (date);
-
-  if (*error != NULL)
-    goto out;
-
  out:
   g_clear_object (&access_rules);
   g_free (resource_url);
diff --git a/src/miner/gd-miner-tracker.c b/src/miner/gd-miner-tracker.c
index 3a94657..75960ca 100644
--- a/src/miner/gd-miner-tracker.c
+++ b/src/miner/gd-miner-tracker.c
@@ -22,6 +22,7 @@
 #include <glib.h>
 
 #include "gd-miner-tracker.h"
+#include "gd-utils.h"
 
 static gchar *
 _tracker_utils_format_into_graph (const gchar *graph)
@@ -29,10 +30,55 @@ _tracker_utils_format_into_graph (const gchar *graph)
   return (graph != NULL) ? g_strdup_printf ("INTO <%s> ", graph) : g_strdup ("");
 }
 
+/*
+ * Runs a SELECT for the first value that @attribute has on <@resource>.
+ *
+ * Returns TRUE when a string value was found; if @value is non-NULL, a newly
+ * allocated copy is stored in *@value and must be released with g_free().
+ * Returns FALSE, leaving *@value untouched, when an error was reported through
+ * @error or the query yielded no string. @error is dereferenced to detect
+ * failures, so it must not be NULL.
+ */
+static gboolean
+gd_miner_tracker_sparql_connection_get_string_attribute (TrackerSparqlConnection *connection,
+                                                         GCancellable *cancellable,
+                                                         GError **error,
+                                                         const gchar *resource,
+                                                         const gchar *attribute,
+                                                         gchar **value)
+{
+  TrackerSparqlCursor *cursor = NULL;
+  const gchar *found = NULL;
+  gboolean have_row = FALSE;
+  gchar *sparql;
+
+  sparql = g_strdup_printf ("SELECT ?val { ?urn %s ?val . FILTER (?urn IN (<%s>)) }",
+                            attribute, resource);
+  cursor = tracker_sparql_connection_query (connection, sparql, cancellable, error);
+  g_free (sparql);
+
+  /* only step the cursor if the query itself did not report an error */
+  if (*error == NULL)
+    have_row = tracker_sparql_cursor_next (cursor, cancellable, error);
+
+  if (*error == NULL && have_row)
+    found = tracker_sparql_cursor_get_string (cursor, 0, NULL);
+
+  /* copy the result before our reference to the cursor is dropped */
+  if (found == NULL)
+    have_row = FALSE;
+  else if (value != NULL)
+    *value = g_strdup (found);
+
+  g_clear_object (&cursor);
+  return have_row;
+}
+
 gchar *
 gd_miner_tracker_sparql_connection_ensure_resource (TrackerSparqlConnection *connection,
                                                     GCancellable *cancellable,
                                                     GError **error,
+                                                    gboolean *resource_exists,
                                                     const gchar *graph,
                                                     const gchar *identifier,
                                                     const gchar *class,
@@ -48,6 +94,7 @@ gd_miner_tracker_sparql_connection_ensure_resource (TrackerSparqlConnection *con
   GVariant *insert_res;
   GVariantIter *iter;
   gchar *key = NULL, *val = NULL;
+  gboolean exists = FALSE;
 
   /* build the inner query with all the classes */
   va_start (args, class);
@@ -83,6 +130,7 @@ gd_miner_tracker_sparql_connection_ensure_resource (TrackerSparqlConnection *con
     {
       /* return the found resource */
       retval = g_strdup (tracker_sparql_cursor_get_string (cursor, 0, NULL));
+      exists = TRUE;
       g_debug ("Found resource in the store: %s", retval);
       goto out;
     }
@@ -127,6 +175,9 @@ gd_miner_tracker_sparql_connection_ensure_resource (TrackerSparqlConnection *con
   g_debug ("Created a new resource: %s", retval);
 
  out:
+  if (resource_exists)
+    *resource_exists = exists;
+
   g_clear_object (&cursor);
   return retval;
 }
@@ -332,3 +383,86 @@ gd_miner_tracker_utils_ensure_contact_resource (TrackerSparqlConnection *connect
 
   return retval;
 }
+
+/*
+ * Records @datasource_urn as the "nie:dataSource" of @resource.
+ *
+ * When @resource_exists is TRUE the stored value is read back first and the
+ * write is skipped if it already equals @datasource_urn, so that an unchanged
+ * entry does not touch the DB here. An error from that read is cleared rather
+ * than propagated, and the triple is then written again; @error is handed on
+ * to that write.
+ */
+void
+gd_miner_tracker_update_datasource (TrackerSparqlConnection  *connection,
+                                    const gchar              *datasource_urn,
+                                    gboolean                  resource_exists,
+                                    const gchar              *identifier,
+                                    const gchar              *resource,
+                                    GCancellable             *cancellable,
+                                    GError                  **error)
+{
+  gboolean up_to_date = FALSE;
+
+  if (resource_exists)
+    {
+      gchar *stored_urn;
+
+      up_to_date = gd_miner_tracker_sparql_connection_get_string_attribute
+        (connection, cancellable, error,
+         resource, "nie:dataSource", &stored_urn);
+      g_clear_error (error);
+
+      if (up_to_date)
+        {
+          up_to_date = g_str_equal (stored_urn, datasource_urn);
+          g_free (stored_urn);
+        }
+    }
+
+  if (!up_to_date)
+    gd_miner_tracker_sparql_connection_set_triple
+      (connection, cancellable, error,
+       identifier, resource, "nie:dataSource", datasource_urn);
+}
+
+/* Writes @new_mtime as "nie:contentLastModified" of @resource and returns TRUE,
+ * unless @resource_exists and the stored value parses to the same second; then
+ * nothing is written and FALSE is returned. Errors from the lookup are cleared. */
+gboolean
+gd_miner_tracker_update_mtime (TrackerSparqlConnection  *connection,
+                               gint64                    new_mtime,
+                               gboolean                  resource_exists,
+                               const gchar              *identifier,
+                               const gchar              *resource,
+                               GCancellable             *cancellable,
+                               GError                  **error)
+{
+  GTimeVal stored_mtime;
+  gchar *stored_iso, *new_iso;
+  gboolean unchanged = FALSE;
+
+  if (resource_exists)
+    {
+      unchanged = gd_miner_tracker_sparql_connection_get_string_attribute
+        (connection, cancellable, error,
+         resource, "nie:contentLastModified", &stored_iso);
+      g_clear_error (error);
+      if (unchanged)
+        {
+          unchanged = g_time_val_from_iso8601 (stored_iso, &stored_mtime);
+          g_free (stored_iso);
+        }
+    }
+
+  if (unchanged && stored_mtime.tv_sec == new_mtime)
+    return FALSE;
+
+  new_iso = gd_iso8601_from_timestamp (new_mtime);
+  gd_miner_tracker_sparql_connection_insert_or_replace_triple
+    (connection, cancellable, error,
+     identifier, resource, "nie:contentLastModified", new_iso);
+  g_free (new_iso);
+
+  return TRUE;
+}
diff --git a/src/miner/gd-miner-tracker.h b/src/miner/gd-miner-tracker.h
index 878ce85..59e2d70 100644
--- a/src/miner/gd-miner-tracker.h
+++ b/src/miner/gd-miner-tracker.h
@@ -30,6 +30,7 @@ G_BEGIN_DECLS
 gchar *gd_miner_tracker_sparql_connection_ensure_resource (TrackerSparqlConnection *connection,
                                                            GCancellable *cancellable,
                                                            GError **error,
+                                                           gboolean *resource_exists,
                                                            const gchar *graph,
                                                            const gchar *identifier,
                                                            const gchar *class,
@@ -63,6 +64,21 @@ gchar* gd_miner_tracker_utils_ensure_contact_resource (TrackerSparqlConnection *
                                                        const gchar *email,
                                                        const gchar *fullname);
 
+void gd_miner_tracker_update_datasource (TrackerSparqlConnection  *connection,
+                                         const gchar              *datasource_urn,
+                                         gboolean                  resource_exists,
+                                         const gchar              *identifier,
+                                         const gchar              *resource,
+                                         GCancellable             *cancellable,
+                                         GError                  **error);
+gboolean gd_miner_tracker_update_mtime (TrackerSparqlConnection  *connection,
+                                        gint64                    new_mtime,
+                                        gboolean                  resource_exists,
+                                        const gchar              *identifier,
+                                        const gchar              *resource,
+                                        GCancellable             *cancellable,
+                                        GError                  **error);
+
 G_END_DECLS
 
 #endif /* __GD_MINER_TRACKER_H__ */
diff --git a/src/miner/gd-zpj-miner.c b/src/miner/gd-zpj-miner.c
index 976e2c3..f9f598b 100644
--- a/src/miner/gd-zpj-miner.c
+++ b/src/miner/gd-zpj-miner.c
@@ -39,6 +39,8 @@ account_miner_job_process_entry (GdAccountMinerJob *job,
   gchar *resource = NULL;
   gchar *date, *identifier;
   const gchar *class = NULL, *id, *name;
+  gboolean resource_exists, mtime_changed;
+  gint64 new_mtime;
 
   id = zpj_skydrive_entry_get_id (entry);
 
@@ -59,20 +61,36 @@ account_miner_job_process_entry (GdAccountMinerJob *job,
   resource = gd_miner_tracker_sparql_connection_ensure_resource
     (job->connection,
      job->cancellable, error,
+     &resource_exists,
      job->datasource_urn, identifier,
      "nfo:RemoteDataObject", class, NULL);
 
   if (*error != NULL)
     goto out;
 
-  gd_miner_tracker_sparql_connection_set_triple
-    (job->connection, job->cancellable, error,
-     job->datasource_urn, resource,
-     "nie:dataSource", job->datasource_urn);
+  gd_miner_tracker_update_datasource (job->connection, job->datasource_urn,
+                                      resource_exists, identifier, resource,
+                                      job->cancellable, error);
 
   if (*error != NULL)
     goto out;
 
+  updated_time = zpj_skydrive_entry_get_updated_time (entry);
+  new_mtime = g_date_time_to_unix (updated_time);
+  mtime_changed = gd_miner_tracker_update_mtime (job->connection, new_mtime,
+                                                 resource_exists, identifier, resource,
+                                                 job->cancellable, error);
+
+  if (*error != NULL)
+    goto out;
+
+  /* avoid updating the DB if the entry already exists and has not
+   * been modified since our last run.
+   */
+  if (!mtime_changed)
+    goto out;
+
+  /* the resource changed - just set all the properties again */
   gd_miner_tracker_sparql_connection_insert_or_replace_triple
     (job->connection,
      job->cancellable, error,
@@ -91,6 +109,7 @@ account_miner_job_process_entry (GdAccountMinerJob *job,
       parent_identifier = g_strconcat ("gd:collection:windows-live:skydrive:", parent_id, NULL);
       parent_resource_urn = gd_miner_tracker_sparql_connection_ensure_resource
         (job->connection, job->cancellable, error,
+         NULL,
          job->datasource_urn, parent_identifier,
          "nfo:RemoteDataObject", "nfo:DataContainer", NULL);
       g_free (parent_identifier);
@@ -170,18 +189,6 @@ account_miner_job_process_entry (GdAccountMinerJob *job,
   if (*error != NULL)
     goto out;
 
-  updated_time = zpj_skydrive_entry_get_updated_time (entry);
-  date = gd_iso8601_from_timestamp (g_date_time_to_unix (updated_time));
-  gd_miner_tracker_sparql_connection_insert_or_replace_triple
-    (job->connection,
-     job->cancellable, error,
-     job->datasource_urn, resource,
-     "nie:contentLastModified", date);
-  g_free (date);
-
-  if (*error != NULL)
-    goto out;
-
  out:
   g_free (resource);
   g_free (identifier);



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]