mad_utf8.c

Go to the documentation of this file.
00001 
00033 #include "mad_utf8.h"
00034 
/* Map from the most-significant 6 bits of the first byte to the total
   number of bytes in a UTF-8 character. An entry of 0 marks a continuation
   byte (10xxxxxx), which is not a valid first byte. */
static const unsigned char UTF8_2_ISO_8859_1_len[] =
  {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* erroneous */
    2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6
  };

/* Mask, indexed by the sequence length from the table above, that selects
   the payload bits of the first byte. Index 0 is used for a stray
   continuation byte. */
static const unsigned char UTF8_2_ISO_8859_1_mask[] = {0x3F, 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01};

/*
 * Convert the null-terminated UTF-8 string utf8str to ISO-8859-1.
 *
 * mbstr   destination buffer, or 0 to only count the resulting characters
 * utf8str null-terminated UTF-8 source string
 * count   size of the destination buffer in bytes; if 0 nothing is written
 *         and the characters are only counted
 *
 * Characters that cannot be represented in ISO-8859-1 (code points above
 * 0xFF) and malformed sequences (stray continuation bytes, truncated
 * sequences, overlong encodings) are replaced by '?'.
 *
 * Returns the number of characters produced, not counting the terminating
 * null character. When writing, conversion stops once count characters have
 * been stored; in that case the return value equals count and the
 * destination is NOT null-terminated.
 */
int x_utf8s_to_iso_8859_1s(char *mbstr, const char *utf8str, size_t count) {
        /* read the source as unsigned bytes so shifts and masks never see
           negative values on platforms where char is signed */
        const unsigned char *src = (const unsigned char *)utf8str;
        const int writing = (mbstr != 0 && count != 0);
        int res = 0;

        while (*src != '\0') {
                const int len = UTF8_2_ISO_8859_1_len[*src >> 2];
                unsigned long u = (unsigned long)(*src & UTF8_2_ISO_8859_1_mask[len]);
                /* a stray continuation byte swallows up to four further
                   continuation bytes so the whole broken run counts as one
                   character */
                int pending = (len == 0) ? 4 : len - 1;
                int representable;

                /* the continuation test also fails on the terminating null,
                   so we never read past the end of the string */
                for (++src; pending > 0 && (*src & 0xC0) == 0x80; ++src, --pending) {
                        u = (u << 6) | (unsigned long)(*src & 0x3F);
                }

                if (len == 1) {
                        /* plain ASCII */
                        representable = 1;
                } else if (len == 2 && pending == 0) {
                        /* values below 0x80 are overlong encodings, values
                           above 0xFF do not exist in ISO-8859-1 */
                        representable = (u >= 0x80 && u <= 0xFF);
                } else {
                        /* invalid first byte, truncated sequence, or a
                           sequence of three or more bytes: the latter is
                           either above 0xFF or an overlong encoding */
                        representable = 0;
                }

                if (writing) {
                        /* be sure there is enough space left in the destination
                           buffer; compare as size_t so a huge count is not
                           truncated */
                        if ((size_t)res >= count) {
                                return res;
                        }

                        *mbstr++ = (representable ? (char)u : '?');
                }
                ++res;
        }

        /* add the terminating null character if there is room for it */
        if (writing) {
                if ((size_t)res >= count) {
                        return res;
                }

                *mbstr = 0;
        }

        return res;
}
00094 
/*
 * Convert the null-terminated ISO-8859-1 string mbstr to UTF-8.
 *
 * utf8str destination buffer, or 0 to only compute the required length
 * mbstr   null-terminated ISO-8859-1 source string
 * count   size of the destination buffer in bytes; if 0 nothing is written
 *         and the bytes are only counted
 *
 * Bytes below 0x80 are copied unchanged; bytes from 0x80 to 0xFF become a
 * two-byte UTF-8 sequence. A two-byte sequence is never split: if only one
 * byte of space is left, conversion stops before it.
 *
 * Returns the number of UTF-8 bytes produced, not counting the terminating
 * null character. When writing, a terminating null is stored only if the
 * whole source was converted and there is still room for it; whenever the
 * function stops early for lack of space the destination is NOT
 * null-terminated.
 */
int x_iso_8859_1s_to_utf8s(char *utf8str, const char *mbstr, size_t count) {
        /* read the source as unsigned bytes so the shifts below never act on
           negative values on platforms where char is signed */
        const unsigned char *src = (const unsigned char *)mbstr;
        const int writing = (utf8str != 0 && count != 0);
        int res = 0;

        /* loop until we reach the end of the mb string */
        for (; *src != '\0'; ++src) {
                /* the character needs no mapping if the highest bit is not set */
                const int width = (*src < 0x80) ? 1 : 2;

                if (writing) {
                        /* be sure there is enough space left in the destination
                           buffer; res never exceeds count here, and comparing
                           as size_t keeps a huge count from being truncated */
                        if (count - (size_t)res < (size_t)width) {
                                return res;
                        }

                        if (width == 1) {
                                *utf8str++ = (char)*src;
                        } else {
                                /* 110000xx 10xxxxxx */
                                *utf8str++ = (char)(0xC0 | (*src >> 6));
                                *utf8str++ = (char)(0x80 | (*src & 0x3F));
                        }
                }
                res += width;
        }

        /* add the terminating null character if there is room for it */
        if (writing) {
                if ((size_t)res >= count) {
                        return res;
                }

                *utf8str = 0;
        }

        return res;
}

Generated on Fri Mar 9 19:59:53 2007 for MAD-FCL by  doxygen 1.5.0