diff options
author | Arnavion <arnavion@gmail.com> | 2015-01-18 02:10:04 -0800 |
---|---|---|
committer | Arnavion <arnavion@gmail.com> | 2015-01-18 02:10:04 -0800 |
commit | 5749c53484369515c4e6df4a4730b1f34fab82b4 (patch) | |
tree | 8ba88feef1e6086981f8add539572d6cd0b142c2 | |
parent | 5569205d1578f46a4ba4a2b23d8e48a933acd6ce (diff) |
Server line text-encoding-related fixes.
- Handle server lines that contain sequences which are invalid in the server encoding. Previously, these would cause the whole line to be interpreted in ISO-8859-1, but now they're simply replaced with an appropriate replacement character.
- Remove prefs.utf8_locale.
- Change default server encoding from system locale to UTF-8.
- Always populate server->encoding with a non-null value (UTF-8 by default).

Fixes #1198
-rw-r--r-- | src/common/dcc.c | 29 | ||||
-rw-r--r-- | src/common/hexchat.c | 4 | ||||
-rw-r--r-- | src/common/hexchat.h | 1 | ||||
-rw-r--r-- | src/common/plugin.c | 12 | ||||
-rw-r--r-- | src/common/server.c | 148 | ||||
-rw-r--r-- | src/common/text.c | 242 | ||||
-rw-r--r-- | src/common/text.h | 3 |
7 files changed, 121 insertions, 318 deletions
diff --git a/src/common/dcc.c b/src/common/dcc.c index b48bd3a8..65f52322 100644 --- a/src/common/dcc.c +++ b/src/common/dcc.c @@ -505,29 +505,11 @@ dcc_chat_line (struct DCC *dcc, char *line) session *sess; char *word[PDIWORDS]; char *po; - char *utf; - char *conv; int ret, i; - gssize len; - gsize utf_len; char portbuf[32]; message_tags_data no_tags = MESSAGE_TAGS_DATA_INIT; - len = strlen (line); - - if (dcc->serv->encoding == NULL) /* system */ - utf = g_locale_to_utf8 (line, len, NULL, &utf_len, NULL); - else - utf = g_convert (line, len, "UTF-8", dcc->serv->encoding, 0, &utf_len, 0); - - if (utf) - { - line = utf; - len = utf_len; - } - - /* we really need valid UTF-8 now */ - conv = text_validate (&line, &len); + line = text_invalid_encoding_to_utf8 (line, -1, dcc->serv->encoding, NULL); sess = find_dialog (dcc->serv, dcc->nick); if (!sess) @@ -548,16 +530,14 @@ dcc_chat_line (struct DCC *dcc, char *line) /* did the plugin close it? */ if (!g_slist_find (dcc_list, dcc)) { - g_free (utf); - g_free (conv); + g_free (line); return 1; } /* did the plugin eat the event? 
*/ if (ret) { - g_free (utf); - g_free (conv); + g_free (line); return 0; } @@ -574,8 +554,7 @@ dcc_chat_line (struct DCC *dcc, char *line) { inbound_privmsg (dcc->serv, dcc->nick, "", line, FALSE, &no_tags); } - g_free (utf); - g_free (conv); + g_free (line); return 0; } diff --git a/src/common/hexchat.c b/src/common/hexchat.c index 7d8e462f..1428039a 100644 --- a/src/common/hexchat.c +++ b/src/common/hexchat.c @@ -757,7 +757,6 @@ static void xchat_init (void) { char buf[3068]; - const char *cs = NULL; #ifdef WIN32 WSADATA wsadata; @@ -795,9 +794,6 @@ xchat_init (void) #endif #endif - if (g_get_charset (&cs)) - prefs.utf8_locale = TRUE; - load_text_events (); sound_load (); notify_load (); diff --git a/src/common/hexchat.h b/src/common/hexchat.h index 1d4e4f1f..808cb90e 100644 --- a/src/common/hexchat.h +++ b/src/common/hexchat.h @@ -317,7 +317,6 @@ struct hexchatprefs guint32 dcc_ip; unsigned int wait_on_exit; /* wait for logs to be flushed to disk IF we're connected */ - unsigned int utf8_locale; /* Tells us if we need to save, only when they've been edited. This is so that we continue using internal defaults (which can diff --git a/src/common/plugin.c b/src/common/plugin.c index b0eef41d..5ef20de8 100644 --- a/src/common/plugin.c +++ b/src/common/plugin.c @@ -970,9 +970,7 @@ hexchat_printf (hexchat_plugin *ph, const char *format, ...) void hexchat_command (hexchat_plugin *ph, const char *command) { - char *command_nonconst; - char *conv; - gssize len = -1; + char *command_utf8; if (!is_session (ph->context)) { @@ -981,11 +979,9 @@ hexchat_command (hexchat_plugin *ph, const char *command) } /* scripts/plugins continue to send non-UTF8... 
*sigh* */ - command_nonconst = g_strdup (command); - conv = text_validate (&command_nonconst, &len); - handle_command (ph->context, command_nonconst, FALSE); - g_free (conv); - g_free (command_nonconst); + command_utf8 = text_invalid_encoding_to_utf8 (command, -1, "UTF-8", NULL); + handle_command (ph->context, command_utf8, FALSE); + g_free (command_utf8); } void diff --git a/src/common/server.c b/src/common/server.c index 75192dfa..f9ca809e 100644 --- a/src/common/server.c +++ b/src/common/server.c @@ -89,48 +89,18 @@ int tcp_send_real (void *ssl, int sok, char *encoding, char *buf, int len) { int ret; - char *locale; - gsize loc_len; - if (encoding == NULL) /* system */ - { - locale = NULL; - if (!prefs.utf8_locale) - { - const gchar *charset; - - g_get_charset (&charset); - locale = g_convert_with_fallback (buf, len, charset, "UTF-8", "?", 0, &loc_len, 0); - } - } - else - { - locale = g_convert_with_fallback (buf, len, encoding, "UTF-8", "?", 0, &loc_len, 0); - } - - if (locale) - { - len = loc_len; -#ifdef USE_OPENSSL - if (!ssl) - ret = send (sok, locale, len, 0); - else - ret = _SSL_send (ssl, locale, len); -#else - ret = send (sok, locale, len, 0); -#endif - g_free (locale); - } else - { + gsize buf_encoded_len; + gchar *buf_encoded = text_invalid_utf8_to_encoding (buf, len, encoding, &buf_encoded_len); #ifdef USE_OPENSSL - if (!ssl) - ret = send (sok, buf, len, 0); - else - ret = _SSL_send (ssl, buf, len); + if (!ssl) + ret = send (sok, buf_encoded, buf_encoded_len, 0); + else + ret = _SSL_send (ssl, buf_encoded, buf_encoded_len); #else - ret = send (sok, buf, len, 0); + ret = send (sok, buf_encoded, buf_encoded_len, 0); #endif - } + g_free (buf_encoded); return ret; } @@ -287,94 +257,15 @@ close_socket (int sok) static void server_inline (server *serv, char *line, gssize len) { - char *utf_line_allocated = NULL; - - /* Checks whether we're set to use UTF-8 charset */ - if ((serv->encoding == NULL && prefs.utf8_locale) /* Using system default - UTF-8 */ || 
- g_ascii_strcasecmp (serv->encoding, "UTF8") == 0 || - g_ascii_strcasecmp (serv->encoding, "UTF-8") == 0 - ) - { - utf_line_allocated = text_validate (&line, &len); - } - else - { - /* Since the user has an explicit charset set, either - via /charset command or from his non-UTF8 locale, - we don't fallback to ISO-8859-1 and instead try to remove - errnoeous octets till the string is convertable in the - said charset. */ + gsize len_utf8; + line = text_invalid_encoding_to_utf8 (line, len, serv->encoding, &len_utf8); - const char *encoding = NULL; - - if (serv->encoding != NULL) - encoding = serv->encoding; - else - g_get_charset (&encoding); - - if (encoding != NULL) - { - char *conv_line; /* holds a copy of the original string */ - gsize conv_len; /* tells g_convert how much of line to convert */ - gsize utf_len; - gsize read_len; - GError *err; - gboolean retry; - - conv_line = g_malloc (len + 1); - memcpy (conv_line, line, len); - conv_line[len] = 0; - conv_len = len; - - /* if CP1255, convert it with the NUL terminator. - Works around SF bug #1122089 */ - if (serv->using_cp1255) - conv_len++; - - do - { - err = NULL; - retry = FALSE; - utf_line_allocated = g_convert_with_fallback (conv_line, conv_len, "UTF-8", encoding, "?", &read_len, &utf_len, &err); - if (err != NULL) - { - if (err->code == G_CONVERT_ERROR_ILLEGAL_SEQUENCE && conv_len > (read_len + 1)) - { - /* Make our best bet by removing the erroneous char. - This will work for casual 8-bit strings with non-standard chars. */ - memmove (conv_line + read_len, conv_line + read_len + 1, conv_len - read_len -1); - conv_len--; - retry = TRUE; - } - g_error_free (err); - } - } while (retry); - - g_free (conv_line); - - /* If any conversion has occured at all. Conversion might fail - due to errors other than invalid sequences, e.g. unknown charset. 
*/ - if (utf_line_allocated != NULL) - { - line = utf_line_allocated; - len = utf_len; - if (serv->using_cp1255 && len > 0) - len--; - } - else - { - /* If all fails, treat as UTF-8 with fallback to ISO-8859-1. */ - utf_line_allocated = text_validate (&line, &len); - } - } - } - - fe_add_rawlog (serv, line, len, FALSE); + fe_add_rawlog (serv, line, len_utf8, FALSE); /* let proto-irc.c handle it */ - serv->p_inline (serv, line, len); + serv->p_inline (serv, line, len_utf8); - g_free (utf_line_allocated); + g_free (line); } /* read data from socket */ @@ -1749,12 +1640,7 @@ server_set_encoding (server *serv, char *new_encoding) { char *space; - if (serv->encoding) - { - g_free (serv->encoding); - /* can be left as NULL to indicate system encoding */ - serv->encoding = NULL; - } + g_free (serv->encoding); if (new_encoding) { @@ -1772,6 +1658,10 @@ server_set_encoding (server *serv, char *new_encoding) serv->encoding = g_strdup ("UTF-8"); } } + else + { + serv->encoding = g_strdup ("UTF-8"); + } } server * @@ -1816,6 +1706,8 @@ server_set_defaults (server *serv) serv->nick_prefixes = g_strdup ("@%+"); serv->nick_modes = g_strdup ("ohv"); + server_set_encoding (serv, "UTF-8"); + serv->nickcount = 1; serv->end_of_motd = FALSE; serv->is_away = FALSE; diff --git a/src/common/text.c b/src/common/text.c index 11a4595a..3f9d4441 100644 --- a/src/common/text.c +++ b/src/common/text.c @@ -658,33 +658,29 @@ log_open_or_close (session *sess) int get_stamp_str (char *fmt, time_t tim, char **ret) { - char *loc = NULL; char dest[128]; - gsize len; + gsize len_locale; + gsize len_utf8; - /* strftime wants the format string in LOCALE! */ - if (!prefs.utf8_locale) - { - const gchar *charset; + /* strftime requires the format string to be in locale encoding. 
*/ + fmt = g_locale_from_utf8 (fmt, -1, NULL, NULL, NULL); - g_get_charset (&charset); - loc = g_convert_with_fallback (fmt, -1, charset, "UTF-8", "?", 0, 0, 0); - if (loc) - fmt = loc; - } + len_locale = strftime_validated (dest, sizeof (dest), fmt, localtime (&tim)); + + g_free (fmt); - len = strftime_validated (dest, sizeof (dest), fmt, localtime (&tim)); - if (len) + if (len_locale == 0) { - if (prefs.utf8_locale) - *ret = g_strdup (dest); - else - *ret = g_locale_to_utf8 (dest, len, 0, &len, 0); + return 0; } - g_free (loc); + *ret = g_locale_to_utf8 (dest, len_locale, NULL, &len_utf8, NULL); + if (*ret == NULL) + { + return 0; + } - return len; + return len_utf8; } static void @@ -753,154 +749,101 @@ log_write (session *sess, char *text, time_t ts) g_free (temp); } -/* converts a CP1252/ISO-8859-1(5) hybrid to UTF-8 */ -/* Features: 1. It never fails, all 00-FF chars are converted to valid UTF-8 */ -/* 2. Uses CP1252 in the range 80-9f because ISO doesn't have any- */ -/* thing useful in this range and it helps us receive from mIRC */ -/* 3. The five undefined chars in CP1252 80-9f are replaced with */ -/* ISO-8859-15 control codes. */ -/* 4. Handles 0xa4 as a Euro symbol ala ISO-8859-15. */ -/* 5. Uses ISO-8859-1 (which matches CP1252) for everything else. */ -/* 6. This routine measured 3x faster than g_convert :) */ - -static unsigned char * -iso_8859_1_to_utf8 (unsigned char *text, int len, gsize *bytes_written) +/** + * Converts a given string in from_encoding to to_encoding. This is similar to g_convert_with_fallback, except that it is tolerant of sequences in + * the original input that are invalid even in from_encoding. g_convert_with_fallback fails for such text, whereas this function replaces such a + * sequence with the fallback string. + * + * If len is -1, strlen(text) is used to calculate the length. Do not pass -1 if text is supposed to contain \0 bytes, such as if from_encoding is a + * multi-byte encoding like UTF-16. 
+ */ +static gchar * +text_convert_invalid (const gchar* text, gssize len, const gchar *to_encoding, const gchar *from_encoding, const gchar *fallback, gsize *len_out) { - unsigned int idx; - unsigned char *res, *output; - static const unsigned short lowtable[] = /* 74 byte table for 80-a4 */ - { - /* compressed utf-8 table: if the first byte's 0x20 bit is set, it - indicates a 2-byte utf-8 sequence, otherwise prepend a 0xe2. */ - 0x82ac, /* 80 Euro. CP1252 from here on... */ - 0xe281, /* 81 NA */ - 0x809a, /* 82 */ - 0xe692, /* 83 */ - 0x809e, /* 84 */ - 0x80a6, /* 85 */ - 0x80a0, /* 86 */ - 0x80a1, /* 87 */ - 0xeb86, /* 88 */ - 0x80b0, /* 89 */ - 0xe5a0, /* 8a */ - 0x80b9, /* 8b */ - 0xe592, /* 8c */ - 0xe28d, /* 8d NA */ - 0xe5bd, /* 8e */ - 0xe28f, /* 8f NA */ - 0xe290, /* 90 NA */ - 0x8098, /* 91 */ - 0x8099, /* 92 */ - 0x809c, /* 93 */ - 0x809d, /* 94 */ - 0x80a2, /* 95 */ - 0x8093, /* 96 */ - 0x8094, /* 97 */ - 0xeb9c, /* 98 */ - 0x84a2, /* 99 */ - 0xe5a1, /* 9a */ - 0x80ba, /* 9b */ - 0xe593, /* 9c */ - 0xe29d, /* 9d NA */ - 0xe5be, /* 9e */ - 0xe5b8, /* 9f */ - 0xe2a0, /* a0 */ - 0xe2a1, /* a1 */ - 0xe2a2, /* a2 */ - 0xe2a3, /* a3 */ - 0x82ac /* a4 ISO-8859-15 Euro. */ - }; + gchar *result_part; + gsize result_part_len; + const gchar *end; + gsize invalid_start_pos; + GString *result; + const gchar *current_start; if (len == -1) + { len = strlen (text); + } - /* worst case scenario: every byte turns into 3 bytes */ - res = output = g_malloc ((len * 3) + 1); + end = text + len; - while (len) + /* Find the first position of an invalid sequence. */ + result_part = g_convert (text, len, to_encoding, from_encoding, &invalid_start_pos, &result_part_len, NULL); + if (result_part != NULL) { - if (G_LIKELY (*text < 0x80)) + /* All text converted successfully on the first try. Return it. 
*/ + + if (len_out != NULL) { - *output = *text; /* ascii maps directly */ + *len_out = result_part_len; } - else if (*text <= 0xa4) /* 80-a4 use a lookup table */ + + return result_part; + } + + /* One or more invalid sequences exist that need to be replaced with the fallback. */ + + result = g_string_sized_new (len); + current_start = text; + + for (;;) + { + g_assert (current_start + invalid_start_pos < end); + + /* Convert everything before the position of the invalid sequence. It should be successful. */ + result_part = g_convert (current_start, invalid_start_pos, to_encoding, from_encoding, &invalid_start_pos, &result_part_len, NULL); + g_assert (result_part != NULL); + g_string_append_len (result, result_part, result_part_len); + g_free (result_part); + + /* Append the fallback */ + g_string_append (result, fallback); + + /* Now try converting everything after the invalid sequence. */ + current_start += invalid_start_pos + 1; + + result_part = g_convert (current_start, end - current_start, to_encoding, from_encoding, &invalid_start_pos, &result_part_len, NULL); + if (result_part != NULL) { - idx = *text - 0x80; - if (lowtable[idx] & 0x2000) - { - *output++ = (lowtable[idx] >> 8) & 0xdf; /* 2 byte utf-8 */ - *output = lowtable[idx] & 0xff; - } - else + /* The rest of the text converted successfully. Append it and return the whole converted text. */ + + g_string_append_len (result, result_part, result_part_len); + g_free (result_part); + + if (len_out != NULL) { - *output++ = 0xe2; /* 3 byte utf-8 */ - *output++ = (lowtable[idx] >> 8) & 0xff; - *output = lowtable[idx] & 0xff; + *len_out = result->len; } + + return g_string_free (result, FALSE); } - else if (*text < 0xc0) - { - *output++ = 0xc2; - *output = *text; - } - else - { - *output++ = 0xc3; - *output = *text - 0x40; - } - output++; - text++; - len--; - } - *output = 0; /* terminate */ - *bytes_written = output - res; - return res; + /* The rest of the text didn't convert successfully. 
invalid_start_pos has the position of the next invalid sequence. */ + } } -char * -text_validate (char **text, gssize *len) +gchar * +text_invalid_utf8_to_encoding (const gchar* text, gssize len, const gchar *to_encoding, gsize *len_out) { - char *utf; - gsize utf_len; - - /* valid utf8? */ - if (g_utf8_validate (*text, *len, 0)) - return NULL; - -#ifdef WIN32 - if (GetACP () == 1252) /* our routine is better than iconv's 1252 */ -#else - if (prefs.utf8_locale) -#endif - /* fallback to iso-8859-1 */ - utf = iso_8859_1_to_utf8 (*text, *len, &utf_len); - else - { - /* fallback to locale */ - utf = g_locale_to_utf8 (*text, *len, 0, &utf_len, NULL); - if (!utf) - utf = iso_8859_1_to_utf8 (*text, *len, &utf_len); - } - - if (!utf) - { - *text = g_strdup ("%INVALID%"); - *len = 9; - } else - { - *text = utf; - *len = utf_len; - } + return text_convert_invalid (text, len, to_encoding, "UTF-8", "?", len_out); +} - return utf; +gchar * +text_invalid_encoding_to_utf8 (const gchar* text, gssize len, const gchar *from_encoding, gsize *len_out) +{ + return text_convert_invalid (text, len, "UTF-8", from_encoding, "\357\277\275", len_out); } void PrintTextTimeStamp (session *sess, char *text, time_t timestamp) { - char *conv; - if (!sess) { if (!sess_list) @@ -909,22 +852,19 @@ PrintTextTimeStamp (session *sess, char *text, time_t timestamp) } /* make sure it's valid utf8 */ - if (text[0] == 0) + if (text[0] == '\0') { - text = "\n"; - conv = NULL; + text = g_strdup ("\n"); } else { - gssize len = -1; - conv = text_validate ((char **)&text, &len); + text = text_invalid_encoding_to_utf8 (text, -1, "UTF-8", NULL); } log_write (sess, text, timestamp); scrollback_save (sess, text); fe_print_text (sess, text, timestamp, FALSE); - - g_free (conv); + g_free (text); } void diff --git a/src/common/text.h b/src/common/text.h index 2534187e..4f47815d 100644 --- a/src/common/text.h +++ b/src/common/text.h @@ -57,7 +57,8 @@ void text_emit (int index, session *sess, char *a, char *b, char *c, 
char *d, time_t timestamp); int text_emit_by_name (char *name, session *sess, time_t timestamp, char *a, char *b, char *c, char *d); -char *text_validate (char **text, gssize *len); +gchar *text_invalid_utf8_to_encoding (const gchar* text, gssize len, const gchar *to_encoding, gsize *len_out); +gchar *text_invalid_encoding_to_utf8 (const gchar* text, gssize len, const gchar *from_encoding, gsize *len_out); int get_stamp_str (char *fmt, time_t tim, char **ret); void format_event (session *sess, int index, char **args, char *o, gsize sizeofo, unsigned int stripcolor_args); char *text_find_format_string (char *name); |