Hi there, I've been trying to un-reinvent at least one wheel in yelp's man page handling and was wondering if someone could have a look at the result. The motivation is that re-encoding man pages is currently broken (mainly, I think, because the language guessing is wrong). For example, on Debian with yelp from git, run something like 'LC_ALL=de_DE yelp man:man' and look at the umlauts. Then I noticed that 'man -R utf-8 man' worked fine. "Aha!" I said. "Let's just use man -R". Of course, that isn't good enough on its own, because not all computers that can run yelp have man-db installed :-( So here's a patch that works out whether man-db is installed and, if so, uses it to do the recoding instead of yelp's built-in conversion. Comments please! Rupert
From 44521a8b5ffa2f2bc3970aec826627e38fafdf85 Mon Sep 17 00:00:00 2001 From: Rupert Swarbrick <rswarbrick gmail com> Date: Sun, 12 Dec 2010 01:53:01 +0000 Subject: [PATCH] Try to use man-db to recode man pages to utf8. This patch adds a test (calling "man -R utf8 man") to see whether there's a version of man installed that'll do recoding. If so, we use that for converting to utf8 instead of our builtin glib version. This patch also changes the built-in version to use GCharsetConverter so we can chain it with the magic decompressor. --- libyelp/yelp-man-document.c | 103 ++--------------- libyelp/yelp-man-parser.c | 259 +++++++++++++++++++++++++++++++++++-------- libyelp/yelp-man-parser.h | 6 +- 3 files changed, 231 insertions(+), 137 deletions(-) diff --git a/libyelp/yelp-man-document.c b/libyelp/yelp-man-document.c index 14ac8cd..d08f541 100644 --- a/libyelp/yelp-man-document.c +++ b/libyelp/yelp-man-document.c @@ -63,62 +63,6 @@ struct _YelpManDocumentPrivate { guint error; }; -typedef struct _YelpLangEncodings YelpLangEncodings; -struct _YelpLangEncodings { - gchar *language; - gchar *encoding; -}; -/* http://www.w3.org/International/O-charset-lang.html */ -static const YelpLangEncodings langmap[] = { - { "C", "ISO-8859-1" }, - { "af", "ISO-8859-1" }, - { "ar", "ISO-8859-6" }, - { "bg", "ISO-8859-5" }, - { "be", "ISO-8859-5" }, - { "ca", "ISO-8859-1" }, - { "cs", "ISO-8859-2" }, - { "da", "ISO-8859-1" }, - { "de", "ISO-8859-1" }, - { "el", "ISO-8859-7" }, - { "en", "ISO-8859-1" }, - { "eo", "ISO-8859-3" }, - { "es", "ISO-8859-1" }, - { "et", "ISO-8859-15" }, - { "eu", "ISO-8859-1" }, - { "fi", "ISO-8859-1" }, - { "fo", "ISO-8859-1" }, - { "fr", "ISO-8859-1" }, - { "ga", "ISO-8859-1" }, - { "gd", "ISO-8859-1" }, - { "gl", "ISO-8859-1" }, - { "hu", "ISO-8859-2" }, - { "id", "ISO-8859-1" }, /* is this right */ - { "mt", "ISO-8859-3" }, - { "is", "ISO-8859-1" }, - { "it", "ISO-8859-1" }, - { "iw", "ISO-8859-8" }, - { "ja", "EUC-JP" }, - { "ko", "EUC-KR" }, - { "lt", 
"ISO-8859-13" }, - { "lv", "ISO-8859-13" }, - { "mk", "ISO-8859-5" }, - { "mt", "ISO-8859-3" }, - { "no", "ISO-8859-1" }, - { "pl", "ISO-8859-2" }, - { "pt_BR", "ISO-8859-1" }, - { "ro", "ISO-8859-2" }, - { "ru", "KOI8-R" }, - { "sl", "ISO-8859-2" }, - { "sr", "ISO-8859-2" }, /* Latin, not cyrillic */ - { "sk", "ISO-8859-2" }, - { "sv", "ISO-8859-1" }, - { "tr", "ISO-8859-9" }, - { "uk", "ISO-8859-5" }, - { "zh_CN", "BIG5" }, - { "zh_TW", "BIG5" }, - { NULL, NULL }, -}; - static void yelp_man_document_class_init (YelpManDocumentClass *klass); static void yelp_man_document_init (YelpManDocument *man); static void yelp_man_document_dispose (GObject *object); @@ -390,12 +334,11 @@ man_document_process (YelpManDocument *man) { YelpManDocumentPrivate *priv = GET_PRIV (man); GFile *file = NULL; - gchar *filepath = NULL; + gchar *filepath; GError *error; gint params_i = 0; gchar **params = NULL; YelpManParser *parser; - const gchar *language, *encoding; file = yelp_uri_get_file (priv->uri); if (file == NULL) { @@ -407,44 +350,22 @@ man_document_process (YelpManDocument *man) } filepath = g_file_get_path (file); - g_object_unref (file); - if (!g_file_test (filepath, G_FILE_TEST_IS_REGULAR)) { - error = g_error_new (YELP_ERROR, YELP_ERROR_NOT_FOUND, - _("The file ‘%s’ does not exist."), - filepath); - yelp_document_error_pending ((YelpDocument *) man, error); - g_error_free (error); - goto done; - } - - /* FIXME: get the language */ - language = "C"; - - /* default encoding if the language doesn't match below */ - encoding = g_getenv("MAN_ENCODING"); - if (encoding == NULL) - encoding = "ISO-8859-1"; - - if (language != NULL) { - gint i; - for (i = 0; langmap[i].language != NULL; i++) { - if (g_str_equal (language, langmap[i].language)) { - encoding = langmap[i].encoding; - break; - } - } - } parser = yelp_man_parser_new (); - priv->xmldoc = yelp_man_parser_parse_file (parser, filepath, encoding); + priv->xmldoc = yelp_man_parser_parse_file (parser, file, + filepath, 
&error); yelp_man_parser_free (parser); if (priv->xmldoc == NULL) { - error = g_error_new (YELP_ERROR, YELP_ERROR_PROCESSING, - _("The file ‘%s’ could not be parsed because it is" - " not a well-formed man page."), - filepath); - yelp_document_error_pending ((YelpDocument *) man, error); + if (!error) { + error = g_error_new (YELP_ERROR, YELP_ERROR_PROCESSING, + _("The file ‘%s’ could not be parsed because it is" + " not a well-formed man page."), + filepath); + } + yelp_document_error_pending ((YelpDocument *) man, error); + g_error_free (error); + goto done; } g_mutex_lock (priv->mutex); diff --git a/libyelp/yelp-man-parser.c b/libyelp/yelp-man-parser.c index 49efe9f..f04cffa 100644 --- a/libyelp/yelp-man-parser.c +++ b/libyelp/yelp-man-parser.c @@ -29,6 +29,7 @@ #include <libxml/tree.h> #include <string.h> +#include "yelp-error.h" #include "yelp-man-parser.h" #include "yelp-magic-decompressor.h" @@ -83,6 +84,63 @@ struct _YelpManParser { GSList *nodeStack; }; +typedef struct _YelpLangEncodings YelpLangEncodings; +struct _YelpLangEncodings { + gchar *language; + gchar *encoding; +}; +/* http://www.w3.org/International/O-charset-lang.html */ +static const YelpLangEncodings langmap[] = { + { "C", "ISO-8859-1" }, + { "af", "ISO-8859-1" }, + { "ar", "ISO-8859-6" }, + { "bg", "ISO-8859-5" }, + { "be", "ISO-8859-5" }, + { "ca", "ISO-8859-1" }, + { "cs", "ISO-8859-2" }, + { "da", "ISO-8859-1" }, + { "de", "ISO-8859-1" }, + { "el", "ISO-8859-7" }, + { "en", "ISO-8859-1" }, + { "eo", "ISO-8859-3" }, + { "es", "ISO-8859-1" }, + { "et", "ISO-8859-15" }, + { "eu", "ISO-8859-1" }, + { "fi", "ISO-8859-1" }, + { "fo", "ISO-8859-1" }, + { "fr", "ISO-8859-1" }, + { "ga", "ISO-8859-1" }, + { "gd", "ISO-8859-1" }, + { "gl", "ISO-8859-1" }, + { "hu", "ISO-8859-2" }, + { "id", "ISO-8859-1" }, /* is this right */ + { "mt", "ISO-8859-3" }, + { "is", "ISO-8859-1" }, + { "it", "ISO-8859-1" }, + { "iw", "ISO-8859-8" }, + { "ja", "EUC-JP" }, + { "ko", "EUC-KR" }, + { "lt", "ISO-8859-13" }, 
+ { "lv", "ISO-8859-13" }, + { "mk", "ISO-8859-5" }, + { "mt", "ISO-8859-3" }, + { "no", "ISO-8859-1" }, + { "pl", "ISO-8859-2" }, + { "pt_BR", "ISO-8859-1" }, + { "ro", "ISO-8859-2" }, + { "ru", "KOI8-R" }, + { "sl", "ISO-8859-2" }, + { "sr", "ISO-8859-2" }, /* Latin, not cyrillic */ + { "sk", "ISO-8859-2" }, + { "sv", "ISO-8859-1" }, + { "tr", "ISO-8859-9" }, + { "uk", "ISO-8859-5" }, + { "zh_CN", "BIG5" }, + { "zh_TW", "BIG5" }, + { NULL, NULL }, +}; + + YelpManParser * yelp_man_parser_new (void) { @@ -91,61 +149,174 @@ yelp_man_parser_new (void) return parser; } -xmlDocPtr -yelp_man_parser_parse_file (YelpManParser *parser, - gchar *file, - const gchar *encoding) +/* Checks (caching the answer) whether a man-db compatible man */ +/* implementation is installed. */ +static gboolean +man_db_installed_p () +{ + static gint cache = -1; + if (cache > -1) return (cache == 0); + + gchar* argv[] = { "man", "-R", "utf-8", "man", NULL }; + + /* If we can run "man -R utf-8 man" successfully, then we decide + * that man-db (or a plausible lookalike) must be installed. */ + g_spawn_sync (NULL, argv, NULL, + G_SPAWN_STDOUT_TO_DEV_NULL | G_SPAWN_SEARCH_PATH | + G_SPAWN_STDERR_TO_DEV_NULL, + NULL, NULL, NULL, NULL, &cache, NULL); + + /* If man returns -n, set it to 1: we only care that it was != 0 */ + if (cache < 0) cache = 1; + + return (cache == 0); +} + +static GInputStream* +man_db_utf8_recode (const gchar* path, GError **error) +{ + gboolean ret; + gint stdout; + const gchar* argv[] = { "man", "-R", "utf-8", NULL, NULL }; + GError *err = NULL, *yelp_err = NULL; + + /* I don't have to worry about the lifetime of path, since + g_spawn_async_with_pipes works by calling fork() then execv(). + Fork copies across all pages of memory into my new address + space, so path doesn't need to survive past the call below. 
+ */ + argv[3] = path; + + ret = g_spawn_async_with_pipes (NULL, (gchar**)argv, NULL, + G_SPAWN_SEARCH_PATH, + NULL, NULL, NULL, NULL, &stdout, + NULL, &err); + if (!ret) { + *error = g_error_new (YELP_ERROR, YELP_ERROR_UNKNOWN, + err->message); + g_error_free (err); + return NULL; + } + + return (GInputStream*) g_unix_input_stream_new (stdout, TRUE); +} + +/* + This function is responsible for returning a utf-8 encoded stream + for a man file. + + If we're lucky and man-db is installed, we can call 'man -R' to do + the work for us (rather better than our heuristics, maybe). + + Otherwise, we fall back on rather less clever methods. + + If something goes wrong, we return NULL and set error to be a + YelpError describing the problem. +*/ +static GInputStream* +get_man_utf8 (GFile *file, const gchar *path, GError **error) { - GFile *gfile; - GConverter *converter; + GConverter *decompressor, *charconv; GFileInputStream *file_stream; - GInputStream *stream; + GInputStream *ret, *tmp; + GError *err = NULL; + const gchar *language, *encoding; + + if (man_db_installed_p ()) + return man_db_utf8_recode (path, error); + + /* Bad news: we've got to do it ourselves. 
*/ + + /* FIXME: get the language */ + language = "C"; + + /* default encoding if the language doesn't match below */ + encoding = g_getenv("MAN_ENCODING"); + if (encoding == NULL) + encoding = "ISO-8859-1"; + + if (language != NULL) { + gint i; + for (i = 0; langmap[i].language != NULL; i++) { + if (g_str_equal (language, langmap[i].language)) { + encoding = langmap[i].encoding; + break; + } + } + } + + file = g_file_new_for_path (path); + file_stream = g_file_read (file, NULL, &err); + if (!file_stream) { + *error = g_error_new (YELP_ERROR, YELP_ERROR_NOT_FOUND, + err->message); + g_error_free (err); + g_object_unref (file); + return NULL; + } + + /* Chain converters if necessary with g_converter_input_stream_new + (example in gio/tests/filter-cat.c) */ + decompressor = (GConverter *) yelp_magic_decompressor_new (); + ret = g_converter_input_stream_new ((GInputStream *) file_stream, + decompressor); + g_object_unref (decompressor); + + if (!g_str_equal (encoding, "UTF-8")) { + charconv = + (GConverter *) g_charset_converter_new ("UTF-8", encoding, + &err); + if (!charconv) { + *error = g_error_new (YELP_ERROR, YELP_ERROR_UNKNOWN, + err->message); + g_error_free (err); + g_object_unref (file); + g_object_unref (ret); + return NULL; + } + + tmp = ret; + ret = + (GInputStream *) g_converter_input_stream_new (ret, + charconv); + g_object_unref (charconv); + g_object_unref (tmp); + } + + return ret; +} + +xmlDocPtr +yelp_man_parser_parse_file (YelpManParser *parser, + GFile *file, + const gchar *path, + GError **error) +{ + GInputStream *recoded_stream; gchar *line; gsize len; - gfile = g_file_new_for_path (file); - file_stream = g_file_read (gfile, NULL, NULL); - converter = (GConverter *) yelp_magic_decompressor_new (); - stream = g_converter_input_stream_new ((GInputStream *) file_stream, converter); - parser->stream = g_data_input_stream_new (stream); + recoded_stream = get_man_utf8 (file, path, error); + if (!recoded_stream) return NULL; + + parser->stream = 
g_data_input_stream_new (recoded_stream); parser->doc = xmlNewDoc (BAD_CAST "1.0"); parser->ins = xmlNewNode (NULL, BAD_CAST "Man"); - xmlDocSetRootElement (parser->doc, parser->ins); + xmlDocSetRootElement (parser->doc, parser->ins); parser->make_links = TRUE; - while ((parser->buffer = g_data_input_stream_read_line (parser->stream, &(parser->length), NULL, NULL)) != NULL) { - /* convert this line from the encoding indicated to UTF-8 */ - if (!g_str_equal (encoding, "UTF-8")) { - GError *converr = NULL; - gchar *new_buffer = NULL; - gsize bytes_written = 0; - - /* We are making the - * assumption that there are no partial characters at the end of this - * string, and therefore can use calls like g_convert() which do not - * preserve state - someone tell me if I'm wrong here */ - new_buffer = g_convert (parser->buffer, parser->length, "UTF-8", - encoding, NULL, &bytes_written, &converr); - if (converr != NULL) { - g_print ("Error occurred converting %s to UTF-8: %s\n", - encoding, converr->message); - g_error_free (converr); - break; - } else if (parser->buffer == NULL) { - g_print ("parser->buffer == NULL\n"); - break; - } + while (1) { + parser->buffer = + g_data_input_stream_read_line (parser->stream, + &(parser->length), + NULL, NULL); + if (parser->buffer == NULL) break; - g_free (parser->buffer); - parser->buffer = new_buffer; - parser->length = bytes_written; - } + parser_parse_line (parser); - parser_parse_line (parser); - - g_free (parser->buffer); + g_free (parser->buffer); } g_object_unref (parser->stream); diff --git a/libyelp/yelp-man-parser.h b/libyelp/yelp-man-parser.h index 1901f1b..369ad29 100644 --- a/libyelp/yelp-man-parser.h +++ b/libyelp/yelp-man-parser.h @@ -24,14 +24,16 @@ #define __YELP_MAN_PARSER_H__ #include <glib.h> +#include <gio/gio.h> #include <libxml/tree.h> typedef struct _YelpManParser YelpManParser; YelpManParser * yelp_man_parser_new (void); xmlDocPtr yelp_man_parser_parse_file (YelpManParser *parser, - gchar *file, - const gchar 
*encoding); + GFile *file, + const gchar *path, + GError **error); void yelp_man_parser_free (YelpManParser *parser); #endif /* __YELP_MAN_PARSER_H__ */ -- 1.7.2.3
Attachment:
pgp6PHU3YLuJr.pgp
Description: PGP signature