#include "unicodeandutf8.h"

#include <assert.h>
#include <stddef.h>

/**
 * Classify a byte as the first byte of a UTF-8 sequence.
 *
 * The legacy 5- and 6-byte lead forms are recognised in addition to the
 * 1- to 4-byte forms.
 *
 * @param pInput  The byte to classify.
 * @return 0 for a single-byte (ASCII, < 0x80) character;
 *         2..6 for the lead byte of a multi-byte sequence of that length;
 *         -1 for a byte that cannot start a sequence, i.e. a continuation
 *         byte (0x80..0xBF) or 0xFE/0xFF.
 */
int enc_get_utf8_size(const unsigned char pInput)
{
    if (pInput < 0x80) {
        return 0;       /* 0xxxxxxx: ASCII */
    }
    if (pInput < 0xC0) {
        return -1;      /* 10xxxxxx: continuation byte, not a lead byte */
    }
    if (pInput < 0xE0) {
        return 2;       /* 110xxxxx */
    }
    if (pInput < 0xF0) {
        return 3;       /* 1110xxxx */
    }
    if (pInput < 0xF8) {
        return 4;       /* 11110xxx */
    }
    if (pInput < 0xFC) {
        return 5;       /* 111110xx */
    }
    if (pInput < 0xFE) {
        return 6;       /* 1111110x */
    }
    /* 0xFE and 0xFF do not match any lead-byte pattern. */
    return -1;
}
/**
 * Decode one UTF-8 encoded character into its code point.
 *
 * Sequences of 1 to 6 bytes are accepted, as classified by
 * enc_get_utf8_size(). The value is built arithmetically, so the result
 * does not depend on the host byte order.
 *
 * Continuation bytes are validated one at a time and decoding stops at the
 * first invalid one, so a sequence cut short by a NUL terminator is never
 * read past that terminator.
 *
 * NOTE(review): overlong encodings and surrogate code points are not
 * rejected here; callers needing strict validation must check the result.
 *
 * @param pInput  Pointer to the first byte of the character; must not be NULL.
 * @param Unic    Receives the decoded code point; must not be NULL. It is
 *                set to 0 when decoding fails.
 * @return Number of input bytes consumed (1..6), or 0 if the first byte is
 *         not a valid lead byte or a continuation byte is malformed.
 */
int enc_utf8_to_unicode_one(const unsigned char *pInput, unsigned long *Unic)
{
    assert(pInput != NULL && Unic != NULL);

    *Unic = 0x0;

    int utfbytes = enc_get_utf8_size(*pInput);

    if (utfbytes == 0) {
        /* Single-byte (ASCII) character: the byte is the code point. */
        *Unic = *pInput;
        return 1;
    }
    if (utfbytes < 2 || utfbytes > 6) {
        return 0;
    }

    /*
     * Payload bits of the lead byte: 5 bits for a 2-byte sequence down to
     * 1 bit for a 6-byte sequence (0x1F, 0x0F, 0x07, 0x03, 0x01).
     */
    unsigned long code = *pInput & (0x7Fu >> utfbytes);

    for (int i = 1; i < utfbytes; i++) {
        unsigned char cont = pInput[i];

        /* Every continuation byte must look like 10xxxxxx. */
        if ((cont & 0xC0) != 0x80) {
            return 0;
        }
        code = (code << 6) | (cont & 0x3Fu);
    }

    *Unic = code;
    return utfbytes;
}
/**
 * Encode one code point as UTF-8.
 *
 * Values up to 0x7FFFFFFF are supported, using the legacy 5- and 6-byte
 * forms for values above 0x1FFFFF. No terminator is written.
 *
 * @param unic     Code point to encode.
 * @param pOutput  Destination buffer; must not be NULL.
 * @param outSize  Capacity of pOutput in bytes. Six bytes are always
 *                 sufficient; fewer are accepted when the encoding fits.
 * @return Number of bytes written (1..6), or 0 if unic is greater than
 *         0x7FFFFFFF or the buffer is too small for the encoding (nothing
 *         is written in either case).
 */
int enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput,
                            int outSize)
{
    assert(pOutput != NULL);

    int nbytes;          /* length of the encoded sequence */
    unsigned char lead;  /* marker bits of the first byte */

    if (unic <= 0x0000007FUL) {
        nbytes = 1;
        lead = 0x00;
    } else if (unic <= 0x000007FFUL) {
        nbytes = 2;
        lead = 0xC0;
    } else if (unic <= 0x0000FFFFUL) {
        nbytes = 3;
        lead = 0xE0;
    } else if (unic <= 0x001FFFFFUL) {
        nbytes = 4;
        lead = 0xF0;
    } else if (unic <= 0x03FFFFFFUL) {
        nbytes = 5;
        lead = 0xF8;
    } else if (unic <= 0x7FFFFFFFUL) {
        nbytes = 6;
        lead = 0xFC;
    } else {
        return 0;
    }

    /* Checked at run time so the bound still holds when NDEBUG is defined. */
    if (outSize < nbytes) {
        return 0;
    }

    /* Fill continuation bytes from the end, 6 payload bits at a time. */
    for (int i = nbytes - 1; i > 0; i--) {
        pOutput[i] = (unsigned char)((unic & 0x3F) | 0x80);
        unic >>= 6;
    }

    /* What remains of unic fits in the lead byte's payload bits. */
    pOutput[0] = (unsigned char)(unic | lead);

    return nbytes;
}
146