[PATCH to gnumeric] handle unicode and codepages for Import/Export of Excel files



 Hi, 

 Here is a patch against gnumeric-0.63 that adds support for full handling of
unicode and codepages for Import/Export of Excel files. Textual data from
Excel files will be converted to the current locale's encoding when importing
.xls files and will be converted from current locale's encoding to the
codepage corresponding to the language of the current locale when exporting to
.xls files.

 Additional functions from libc required for functioning: wcstombs and
mbstowcs (required by SVID 3), and also optionally iconv{,_open,_close} - by
undefining the HAVE_ICONV preprocessor symbol the code will compile and work
without the iconv* functions too (though actual conversion won't happen).

 It should work correctly for any encoding, including multibyte ones (not
tested). This patch was tested with Russian: import of Excel 97 and Excel 95
files and export to Excel work flawlessly.

 Before this patch, import and export worked correctly only for
single-byte encodings, and gave correct results only for latin1
locales. E.g. for Russian, all Russian strings became unreadable in
gnumeric after importing .xls files of any version, and in MS Excel after
exporting .xls files from gnumeric. (With this patch it works without any
flaws for Russian, and should work for any non-latin1 locale too, I hope.)

 It seems this patch will help even people using latin languages due to
differences between CP1252 (used for latin1 in Windows) and ISO-8859-1 (used
for latin1 on Unix). And it seems this patch will allow gnumeric to
import/export latin1 chars under UTF8 locales too without problems (so that
characters with value > 127 will be imported/exported correctly).

 It would be nice if this patch was included in Gnumeric that will be shipped
with gnome-1.4.

 What do you think about it?

 Best regards,
  -Vlad
diff -ru -x po gnumeric-0.63~/plugins/excel/ms-excel-read.c gnumeric-0.63/plugins/excel/ms-excel-read.c
--- gnumeric-0.63~/plugins/excel/ms-excel-read.c	Sun Mar 11 17:17:05 2001
+++ gnumeric-0.63/plugins/excel/ms-excel-read.c	Mon Mar 12 10:23:16 2001
@@ -6,9 +6,11 @@
  *    Jody Goldberg (jgoldberg@home.com)
  *
  * (C) 1998, 1999, 2000 Michael Meeks, Jody Goldberg
+ * unicode and national language support (C) 2001 by Vlad Harchev <hvv@hippo.ru>
  **/
 
 #include <config.h>
+#include <locale.h>
 
 #include "ms-formula-read.h"
 #include "ms-excel-read.h"
@@ -55,6 +57,8 @@
 extern int ms_excel_object_debug;
 extern int gnumeric_debugging;
 
+static excel_iconv_t current_workbook_iconv = NULL;
+
 /* Forward references */
 static ExcelSheet *ms_excel_sheet_new       (ExcelWorkbook *wb,
 					     const char *name);
@@ -169,6 +173,53 @@
 	}
 }
 
+/* Convert LENGTH characters at PTR (16-bit units if HIGH_BYTE, else bytes in
+ * the workbook codepage) to a NUL-terminated, g_malloc'd locale string. */
+static char *
+get_chars (const char *ptr, guint length, gboolean high_byte)
+{
+	char* ans;
+	guint32 lp;
+
+	if (high_byte) {
+		wchar_t* wc = g_new (wchar_t, length + 2);
+		size_t outsize = (length+2)*8, retlength;
+		ans = g_new (char, outsize);
+
+		for (lp = 0; lp < length; lp++) {
+			wc[lp] = MS_OLE_GET_GUINT16 (ptr);
+			ptr+=2;
+		}
+		wc[length] = 0; /* wcstombs reads up to a terminating NUL */
+
+		/* the limit is the output size in bytes, not a char count */
+		retlength = wcstombs(ans, wc, outsize - 1);
+		g_free(wc);
+		if (retlength == (size_t)-1)
+			retlength = 0; /* unconvertible: return empty string */
+		ans[retlength] = 0;
+		ans = g_realloc(ans, retlength + 2);
+	} else {
+		size_t inbytes = length,
+			outbytes = (length+2)*8,
+			retlength;
+		char* inbuf = g_new(char, length), *outbufptr;
+		char* inbufptr = inbuf;
+
+		ans = g_new (char, outbytes + 1);
+		outbufptr = ans;
+		for (lp = 0; lp < length; lp++)
+			inbuf[lp] = MS_OLE_GET_GUINT8 (ptr + lp);
+		excel_iconv(current_workbook_iconv,&inbufptr,&inbytes,&outbufptr,&outbytes);
+
+		retlength = outbufptr-ans;
+		ans[retlength] = 0;
+		ans = g_realloc(ans,retlength+1); /* the block may move */
+		g_free(inbuf);
+	}
+	return ans;
+}
+
 /**
  *  This function takes a length argument as Biff V7 has a byte length
  * ( seemingly ).
@@ -185,7 +236,6 @@
 	guint32 byte_len;
 	gboolean header;
 	gboolean high_byte;
-	static gboolean high_byte_warned = FALSE;
 	gboolean ext_str;
 	gboolean rich_str;
 
@@ -207,8 +257,6 @@
 	}
 #endif
 
-	ans = (char *) g_new (char, length + 2);
-
 	header = biff_string_get_flags (pos,
 					&high_byte,
 					&ext_str,
@@ -219,12 +267,6 @@
 	} else
 		ptr = pos;
 
-	/* A few friendly warnings */
-	if (high_byte && !high_byte_warned) {
-		printf ("FIXME: unicode support unimplemented: truncating\n");
-		high_byte_warned = TRUE;
-	}
-
 	{
 		guint32 pre_len, end_len;
 
@@ -242,51 +284,16 @@
 	}
 #endif
 
-	for (lp = 0; lp < length; lp++) {
-		guint16 c;
-
-		if (high_byte) {
-			c = MS_OLE_GET_GUINT16 (ptr);
-			ptr+=2;
-			ans[lp] = (char)c;
-			(*byte_length) += 2;
-		} else {
-			c = MS_OLE_GET_GUINT8 (ptr);
-			ptr+=1;
-			ans[lp] = (char)c;
-			(*byte_length) += 1;
-		}
-	}
-	if (lp > 0)
-		ans[lp] = 0;
-	else
+	if (!length) {
+		ans = g_new (char, 2);
 		g_warning ("Warning unterminated string floating");
+	} else {	
+		(*byte_length) += (high_byte ? 2 : 1)*length;
+		ans = get_chars(ptr, length, high_byte);
+	};
 	return ans;
 }
 
-static char *
-get_utf8_chars (const char *ptr, guint len, gboolean high_byte)
-{
-	int    i;
-	char *ans = g_new (char, len + 1);
-
-	for (i = 0; i < len; i++) {
-		guint16 c;
-
-		if (high_byte) {
-			c = MS_OLE_GET_GUINT16 (ptr);
-			ptr+=2;
-			ans [i] = (char)c;
-		} else {
-			c = MS_OLE_GET_GUINT8 (ptr);
-			ptr+=1;
-			ans [i] = (char)c;
-		}
-	}
-	ans [i] = '\0';
-
-	return ans;
-}
 
 static guint32
 sst_bound_check (BiffQuery *q, guint32 offset)
@@ -366,7 +373,7 @@
 		g_assert (get_len >= 0);
 
 		/* FIXME: split this simple bit out of here, it makes more sense damnit */
-		str = get_utf8_chars (q->data + new_offset + pre_len, get_len, high_byte);
+		str = get_chars (q->data + new_offset + pre_len, get_len, high_byte);
 		new_offset += pre_len + get_len * (high_byte?2:1);
 
 		if (!(*output))
@@ -587,12 +594,23 @@
 		ans->hidden = MS_BIFF_H_VISIBLE;
 		break;
 	}
+#if 0
 	if (ver == MS_BIFF_V8) {
-		int slen = MS_OLE_GET_GUINT16 (q->data + 6);
+		int slen = MS_OLE_GET_GUINT16 (q->data + 6);		
 		ans->name = biff_get_text (q->data + 8, slen, NULL);
-	} else {
+	} else 
+#endif
+	{ 
+		/* 
+		 * there are test files produced by non-latin1 Excel (e.g. 
+		 * russian version) that prove that branch above is 
+		 * incorrect. It seems test files that insured author of branch
+		 * above were produced by latin1 version of Excel - 
+		 * in that case q->data[7] is always 0, so it can be attributed
+		 * to length of sheet name or to the string header.
+		 * 			- Vlad Harchev <hvv@hippo.ru>
+		 */
 		int slen = MS_OLE_GET_GUINT8 (q->data + 6);
-
 		ans->name = biff_get_text (q->data + 7, slen, NULL);
 	}
 
@@ -4172,6 +4190,8 @@
 			/* MW: And on Excel seems to drive the display
 			   of currency amounts.  */
 			const guint16 codepage = MS_OLE_GET_GUINT16 (q->data);
+			excel_iconv_close(current_workbook_iconv);
+			current_workbook_iconv = excel_iconv_open_for_import(codepage);
 #ifndef NO_DEBUG_EXCEL
 			if (ms_excel_read_debug > 0) {
 				switch(codepage) {
@@ -4319,7 +4339,7 @@
 		fflush (stdout);
 	}
 #endif
-
+	excel_iconv_close(current_workbook_iconv);
 	if (wb) {
 		/* Cleanup */
 		ms_excel_workbook_destroy (wb);
diff -ru -x po gnumeric-0.63~/plugins/excel/ms-excel-util.c gnumeric-0.63/plugins/excel/ms-excel-util.c
--- gnumeric-0.63~/plugins/excel/ms-excel-util.c	Tue Oct 31 20:21:05 2000
+++ gnumeric-0.63/plugins/excel/ms-excel-util.c	Mon Mar 12 10:40:20 2001
@@ -5,6 +5,7 @@
  *    Jon K Hellan (hellan@acm.org)
  *
  * (C) 1999, 2000 Jon K Hellan
+ * excel_iconv* family of functions (C) 2001 by Vlad Harchev <hvv@hippo.ru>
  **/
 
 #include "config.h"
@@ -14,7 +15,17 @@
 #include "ms-excel-util.h"
 
 #include <stdio.h>
+#include <string.h>
 
+#ifdef HAVE_LANGINFO_H
+#include <langinfo.h>
+#endif
+
+/* comment out this if you don't have iconv available */
+#define HAVE_ICONV
+#ifdef HAVE_ICONV
+#include <iconv.h>
+#endif
 extern int ms_excel_read_debug;
 
 /*
@@ -318,4 +329,160 @@
 
 	/* Use a rough heuristic for unknown fonts. */
 	return .5625 * size_pts;
+}
+
+
+
+/* Return the charset name of the current locale (cached after the first
+ * call; "" when built without iconv). Callers must not free the result. */
+static char*
+get_locale_charset_name()
+{
+#ifndef HAVE_ICONV
+	return "";
+#else
+	static char* charset = NULL;
+	if (charset)
+		return charset;
+#ifdef _NL_CTYPE_CODESET_NAME
+	charset = nl_langinfo (_NL_CTYPE_CODESET_NAME);
+#elif defined(CODESET)
+	charset = nl_langinfo (CODESET);
+#else /* no codeset query available: take the part after '.' in the locale */
+	{
+		char* locale = setlocale(LC_CTYPE,NULL);
+		char* tmp = locale ? strchr(locale,'.') : NULL;
+		if (tmp)
+			charset = tmp+1;
+	}
+#endif
+	if (!charset)
+		charset = "ISO-8859-1";
+	charset = g_strdup(charset);
+	return charset;
+#endif
+}
+
+typedef struct /* maps a set of language/locale names to one integer */
+{
+	const char** keys;/*NULL-terminated list*/
+	int value; /* used below as a Windows codepage number */
+} s_hash_entry;
+
+static const char* cyr_locales[] = /* names that select codepage 1251 */
+{
+	"russian", "ru", "be", "uk", NULL
+};
+
+static const s_hash_entry win_codepages[]= /* ends at the entry with NULL keys */
+{
+	{ cyr_locales , 1251 },
+	{ NULL }
+};
+
+/* Guess the Windows codepage to export with: $WINDOWS_LANGUAGE if set, else
+ * the language part of the LC_CTYPE locale; 1252 when unknown. Cached. */
+guint
+excel_iconv_win_codepage()
+{
+	static guint codepage = 0;
+	const s_hash_entry* entry;
+	const char* env_lang;
+	char* lang;
+
+	if (codepage)
+		return codepage;
+
+	env_lang = getenv("WINDOWS_LANGUAGE"); /* just for flexibility */
+	if (env_lang)
+		lang = g_strdup(env_lang);
+	else {
+		const char* locale = setlocale(LC_CTYPE,NULL);
+		const char* lang_sep = locale ? strchr(locale,'_') : NULL;
+		/* "ru_RU.KOI8-R" -> "ru"; names without '_' are used whole */
+		if (lang_sep)
+			lang = g_strndup(locale,lang_sep-locale);
+		else
+			lang = g_strdup(locale ? locale : "");
+	}
+	/* the first table entry listing this language wins */
+	for(entry = win_codepages; entry->keys && !codepage; ++entry) {
+		const char** key;
+		for(key=entry->keys; *key && !codepage; ++key) {
+			if (!g_strcasecmp(*key,lang))
+				codepage = entry->value;
+		}
+	}
+	g_free(lang); /* lang is always our own copy here */
+	if (!codepage)
+		codepage = 1252; /*default one*/
+	return codepage;
+}
+
+/*these two will figure out which charset names to use*/
+/* Open a converter from Excel codepage CODEPAGE ("CP<n>") to the locale's
+ * charset; yields (excel_iconv_t)(-1) when built without iconv. */
+excel_iconv_t
+excel_iconv_open_for_import(guint codepage)
+{
+#ifdef HAVE_ICONV
+	char* from_name = g_strdup_printf("CP%d",codepage);
+	iconv_t cd = iconv_open(get_locale_charset_name(), from_name);
+
+	g_free(from_name);
+	return cd;
+#else
+	return (excel_iconv_t)(-1);
+#endif
+}
+
+/* Open a converter from the locale's charset to the export codepage
+ * ("CP<n>", name cached); (excel_iconv_t)(-1) when built without iconv. */
+excel_iconv_t
+excel_iconv_open_for_export()
+{
+#ifndef HAVE_ICONV
+	return (excel_iconv_t)(-1);
+#else
+	static char* dest_charset = NULL;
+
+	if (!dest_charset)
+		dest_charset = g_strdup_printf("CP%u",excel_iconv_win_codepage());
+	return iconv_open(dest_charset, get_locale_charset_name());
+#endif
+}
+
+void
+excel_iconv_close(excel_iconv_t handle) /* NULL and (-1) are ignored */
+{
+#ifdef HAVE_ICONV
+	if (handle != NULL && handle != (excel_iconv_t)(-1))
+		iconv_close((iconv_t)handle);
+#endif
+}
+
+/* iconv() wrapper: bytes that cannot be converted are copied through as-is;
+ * stops when input or output space is exhausted. Always returns 0. */
+size_t
+excel_iconv(excel_iconv_t handle,char ** const inbuf, size_t *inbytesleft,
+				char **outbuf, size_t *outbytesleft)
+{
+#ifndef HAVE_ICONV
+	guint tocopy = *inbytesleft <= *outbytesleft ? *inbytesleft : *outbytesleft;
+	memcpy(*outbuf,*inbuf,tocopy);
+	*outbuf += tocopy; *outbytesleft -= tocopy;
+	*inbuf += tocopy; *inbytesleft -= tocopy;
+#else
+	const gboolean usable = handle != NULL && handle != (iconv_t)(-1);
+	while (*inbytesleft != 0) {
+		if (usable)
+			iconv((iconv_t)handle, inbuf, inbytesleft, outbuf, outbytesleft);
+		if (*inbytesleft == 0 || *outbytesleft == 0)
+			break;
+		/* stuck on this byte: emit it unchanged and resume after it */
+		*(*outbuf)++ = *(*inbuf)++;
+		--*outbytesleft; --*inbytesleft;
+	}
+#endif
+	return 0;
+}
diff -ru -x po gnumeric-0.63~/plugins/excel/ms-excel-util.h gnumeric-0.63/plugins/excel/ms-excel-util.h
--- gnumeric-0.63~/plugins/excel/ms-excel-util.h	Fri Mar 24 19:36:02 2000
+++ gnumeric-0.63/plugins/excel/ms-excel-util.h	Mon Mar 12 02:10:47 2001
@@ -10,6 +10,7 @@
 #define GNUMERIC_MS_EXCEL_UTIL_H
 
 #include <glib.h>
+#include <stdlib.h>
 #include "sheet.h"
 
 typedef struct _TwoWayTable   TwoWayTable;
@@ -51,5 +52,38 @@
 double
 lookup_font_base_char_width_new (char const * const name, double size_pts,
 				 gboolean const is_default);
+
+
+/* a group of iconv_* - like functions, with safe fallbacks if iconv is 
+  unavailable. Sorry for stupid prefix - Vlad Harchev <hvv@hippo.ru> */
+typedef void* excel_iconv_t;/*can't be NULL or (-1) */
+
+/* 
+   this returns code of the codepage that should be used when exporting
+   .xls files (it's guessed by looking at language name). Fallback is 1252.
+*/
+guint
+excel_iconv_win_codepage();
+
+/*these two will figure out which charset names to use*/
+excel_iconv_t
+excel_iconv_open_for_import(guint codepage);
+
+excel_iconv_t
+excel_iconv_open_for_export();
+
+void
+excel_iconv_close(excel_iconv_t handle);
+/* If conversion fails (or if compiled without support for iconv), it will
+  copy the input string to output and pretend that all worked fine.
+  If some char is non-convertible, its original byte is copied unchanged.
+
+  It's required that inbytesleft <= outbytesleft (so that the fallback will
+  be able to work). For now, the return value is not meaningful at all - 0
+  is always returned.
+*/
+size_t 
+excel_iconv(excel_iconv_t handle,char ** const inbuf, size_t *inbytesleft,
+				char **outbuf, size_t *outbytesleft); 
 
 #endif
diff -ru -x po gnumeric-0.63~/plugins/excel/ms-excel-write.c gnumeric-0.63/plugins/excel/ms-excel-write.c
--- gnumeric-0.63~/plugins/excel/ms-excel-write.c	Sun Mar 11 17:17:05 2001
+++ gnumeric-0.63/plugins/excel/ms-excel-write.c	Mon Mar 12 10:18:59 2001
@@ -57,6 +57,7 @@
 #include "ms-excel-xf.h"
 #include "ms-formula-write.h"
 
+static excel_iconv_t current_workbook_iconv = NULL;
 /**
  *  This function writes simple strings...
  *  FIXME: see S59D47.HTM for full description
@@ -112,13 +113,31 @@
 	}
 	ms_biff_put_var_write (bp, data, off);
 
-/* You got it coming */
-	for (lp = 0; lp < len; lp++) {
-		MS_OLE_SET_GUINT16 (data, txt[lp]);
-		ms_biff_put_var_write (bp, data, unicode?2:1);
-	}
-	return off + len*(unicode?2:1);
-
+	if (unicode) {
+		wchar_t* wcbuf = g_new(wchar_t,len);
+		len = mbstowcs(wcbuf,txt,len);
+		for (lp = 0; lp < len; lp++) {
+			MS_OLE_SET_GUINT16 (data, wcbuf[lp]);
+			ms_biff_put_var_write (bp, data, 2);
+		}		
+		g_free(wcbuf);
+		lp *= 2;
+	} else {
+		size_t inbufleft = len, outbufleft = len*8;
+		char* mbbuf = g_new(char, outbufleft);
+		char* inbufptr = txt, *outbufptr = mbbuf;
+		int retlen;
+
+		excel_iconv(current_workbook_iconv, &inbufptr, &inbufleft, 
+			&outbufptr, &outbufleft);
+		retlen = outbufptr - mbbuf;
+		for (lp = 0; lp < retlen; lp++) {
+			MS_OLE_SET_GUINT8 (data, mbbuf[lp]);
+			ms_biff_put_var_write (bp, data, 1);			
+		}
+		g_free(mbbuf);
+	};
+	return off + lp;
 	/* An attempt at efficiency */
 /*	chunks = len/BLK_LEN;
 	pos    = 0;
@@ -367,7 +386,7 @@
 
 	/* See: S59D66.HTM */
 	data = ms_biff_put_len_next (bp, BIFF_CODEPAGE, 2);
-	MS_OLE_SET_GUINT16 (data, 0x04e4); /* ANSI */
+	MS_OLE_SET_GUINT16 (data, excel_iconv_win_codepage());
 	ms_biff_put_commit (bp);
 
 	if (ver >= MS_BIFF_V8) { /* See S59D78.HTM */
@@ -2974,7 +2993,7 @@
 	MS_OLE_SET_GUINT32 (data + 20, 0x3fe00000);
 	MS_OLE_SET_GUINT32 (data + 24, 0x00000000);
 	MS_OLE_SET_GUINT32 (data + 28, 0x3fe00000);
-	MS_OLE_SET_GUINT16 (data + 32, 0x04e4);
+	MS_OLE_SET_GUINT16 (data + 32, excel_iconv_win_codepage());
 	ms_biff_put_commit (bp);
 
 	write_externsheets (bp, sheet->wb, sheet);
@@ -3436,6 +3455,7 @@
 	ExcelSheet *s  = 0;
 	int        lp;
 
+	current_workbook_iconv = excel_iconv_open_for_export();
 	/* Workbook */
 	wb->streamPos = biff_bof_write (bp, ver, MS_BIFF_TYPE_Workbook);
 
@@ -3473,6 +3493,8 @@
 					    s->streamPos);
 	}
 	/* End Finalised workbook */
+	excel_iconv_close (current_workbook_iconv);
+	current_workbook_iconv = NULL;
 }
 
 /*


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]