Qore Programming Language  0.9.16
QoreEncoding.h
Go to the documentation of this file.
1 /* -*- mode: c++; indent-tabs-mode: nil -*- */
2 /*
3  QoreEncoding.h
4 
5  Qore Programming Language
6 
7  Copyright (C) 2003 - 2017 Qore Technologies, s.r.o.
8 
9  Permission is hereby granted, free of charge, to any person obtaining a
10  copy of this software and associated documentation files (the "Software"),
11  to deal in the Software without restriction, including without limitation
12  the rights to use, copy, modify, merge, publish, distribute, sublicense,
13  and/or sell copies of the Software, and to permit persons to whom the
14  Software is furnished to do so, subject to the following conditions:
15 
16  The above copyright notice and this permission notice shall be included in
17  all copies or substantial portions of the Software.
18 
19  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25  DEALINGS IN THE SOFTWARE.
26 
27  Note that the Qore library is released under a choice of three open-source
28  licenses: MIT (as above), LGPL 2+, or GPL 2+; see README-LICENSE for more
29  information.
30 */
31 
32 #ifndef _QORE_ENCODING_H
33 
34 #define _QORE_ENCODING_H
35 
41 #include <qore/common.h>
42 #include <qore/QoreThreadLock.h>
43 
44 #include <cstring>
45 #include <map>
46 #include <string>
47 #include <strings.h>
48 
//! for multi-byte character set encodings: gives the length of the string in characters
50 typedef qore_size_t (*mbcs_length_t)(const char* str, const char* end, bool &invalid);
51 
//! for multi-byte character set encodings: gives the number of bytes for the number of chars
53 typedef qore_size_t (*mbcs_end_t)(const char* str, const char* end, qore_size_t num_chars, bool &invalid);
54 
//! for multi-byte character set encodings: gives the character position of the ptr
56 typedef qore_size_t (*mbcs_pos_t)(const char* str, const char* ptr, bool &invalid);
57 
59 
//! for multi-byte encodings: gives the number of total bytes for the character given one or more bytes of it
/** NOTE(review): the rendered description is truncated; the meaning of a negative return
    value (the return type is the signed qore_offset_t) is not visible here - confirm
 */
63 typedef qore_offset_t (*mbcs_charlen_t)(const char* str, qore_size_t valid_len);
64 
//! returns the unicode code point for the given character, assumes there is enough data for the character
/** NOTE(review): the rendered description is truncated after this point - confirm any further preconditions
 */
66 typedef unsigned (*mbcs_get_unicode_t)(const char* p);
67 
68 // private implementation of the QoreEncoding class
// forward declaration only: QoreEncoding stores a pointer to this type ("priv") and declares it a friend
// NOTE(review): "hashdecl" is not defined in this file - presumably a macro from <qore/common.h>; confirm
69 hashdecl qore_encoding_private;
70 
72 
//! defines string encoding functions in Qore
/** The constructor and destructor are declared DLLLOCAL; the query functions are declared DLLEXPORT.
    The data members below are kept only for ABI compatibility (see the NOTE in the protected section).
 */
83 class QoreEncoding {
84  friend hashdecl qore_encoding_private;
85 
86 protected:
87  // FIXME: move all this to the private implementation with the ABI change
88  // NOTE: the following class members cannot be removed because until Qore 0.8.12 this class implemented inline member functions
89  // that referred directly to these member variables, therefore they make up a part of the library's ABI :(
90  std::string code;
91  std::string desc;
    // callback slots typed by the mbcs_*_t typedefs; presumably filled from the constructor's
    // l / e / p / c arguments (all default to 0) - the constructor body is not visible here, confirm
92  mbcs_length_t flength;
93  mbcs_end_t fend;
94  mbcs_pos_t fpos;
95  mbcs_charlen_t fcharlen;
96  unsigned char maxwidth;
97 
    // pointer to the private implementation (type is only forward-declared in this header)
98  qore_encoding_private* priv;
99 
100 public:
    // every argument except n_code has a default value
101  DLLLOCAL QoreEncoding(const char* n_code, const char* n_desc = 0, unsigned char n_minwidth = 1, unsigned char n_maxwidth = 1, mbcs_length_t l = 0, mbcs_end_t e = 0, mbcs_pos_t p = 0, mbcs_charlen_t c = 0, mbcs_get_unicode_t gu = 0, bool n_ascii_compat = true);
102 
103  DLLLOCAL ~QoreEncoding();
104 
106 
    //! gives the length of the string in characters
111  DLLEXPORT qore_size_t getLength(const char* p, const char* end, bool& invalid) const;
112 
114 
    //! overload of getLength() taking an ExceptionSink instead of the "invalid" flag
    // NOTE(review): presumably invalid input is reported through xsink - confirm against the implementation
119  DLLEXPORT qore_size_t getLength(const char* p, const char* end, ExceptionSink* xsink) const;
120 
122 
    //! gives the number of bytes for the number of chars in the string or up to the end of the string
128  DLLEXPORT qore_size_t getByteLen(const char* p, const char* end, qore_size_t c, bool& invalid) const;
129 
131 
    //! overload of getByteLen() taking an ExceptionSink instead of the "invalid" flag
    // NOTE(review): presumably invalid input is reported through xsink - confirm against the implementation
137  DLLEXPORT qore_size_t getByteLen(const char* p, const char* end, qore_size_t c, ExceptionSink* xsink) const;
138 
140 
    //! gives the character position (number of characters) starting from the first pointer to the second
145  DLLEXPORT qore_size_t getCharPos(const char* p, const char* end, bool& invalid) const;
146 
148 
    //! overload of getCharPos() taking an ExceptionSink instead of the "invalid" flag
    // NOTE(review): presumably invalid input is reported through xsink - confirm against the implementation
153  DLLEXPORT qore_size_t getCharPos(const char* p, const char* end, ExceptionSink* xsink) const;
154 
156 
    //! gives the number of total bytes for the next character at the given pointer
    // NOTE(review): the return type is signed (qore_offset_t); the meaning of negative values is not visible here
162  DLLEXPORT qore_offset_t getCharLen(const char* p, qore_size_t valid_len) const;
163 
    //! returns true if the encoding is a multi-byte encoding
165  DLLEXPORT bool isMultiByte() const;
166 
    //! returns the string code (ex: "UTF-8") for the encoding
168  DLLEXPORT const char* getCode() const;
169 
    //! returns the description for the encoding
171  DLLEXPORT const char* getDesc() const;
172 
    //! returns the maximum character width in bytes for the encoding
    // NOTE: returns int while getMinCharWidth() returns unsigned; exported signatures are left untouched
174  DLLEXPORT int getMaxCharWidth() const;
175 
177 
    //! returns the minimum character width in bytes for the encoding
179  DLLEXPORT unsigned getMinCharWidth() const;
180 
182 
    //! returns true if the character encoding is backwards-compatible with ASCII
184  DLLEXPORT bool isAsciiCompat() const;
185 
187 
    //! returns the unicode code point for the given character
    // NOTE(review): the rendered description mentions errors such as an invalid character but is
    // truncated; the roles of "clen" and "xsink" are not documented here - confirm before relying on them
196  DLLEXPORT int getUnicode(const char* p, const char* end, unsigned& clen, ExceptionSink* xsink) const;
197 };
198 
199 // case-insensitive maps for encodings
// keys are C strings; values are QoreEncoding pointers (mutable in encoding_map_t, const in const_encoding_map_t)
// NOTE(review): the comparator ltcstrcase is declared outside this file - confirm it provides the case-insensitive ordering
200 typedef std::map<const char*, QoreEncoding*, ltcstrcase> encoding_map_t;
201 typedef std::map<const char*, const QoreEncoding*, ltcstrcase> const_encoding_map_t;
202 
// forward declaration: Qore's string type supported by the QoreEncoding class (only used by pointer below)
203 class QoreString;
204 
206 
//! manages encodings in Qore
/** All data members and all member functions other than the constructor and destructor are static.
    A global instance is declared in this header as QEM.
 */
208 class QoreEncodingManager {
209 private:
    // static registries typed by the map typedefs declared before this class, and a lock object
    // NOTE(review): presumably emap holds encodings and amap holds aliases, both guarded by mutex - confirm
210  DLLLOCAL static encoding_map_t emap;
211  DLLLOCAL static const_encoding_map_t amap;
212  DLLLOCAL static QoreThreadLock mutex;
213 
    // internal helpers; the "Unlocked" suffix suggests the caller must already hold the lock (confirm)
214  DLLLOCAL static const QoreEncoding* addUnlocked(const char* n_code, const char* n_desc = 0, unsigned char n_minwidth = 1, unsigned char n_maxwidth = 1, mbcs_length_t l = 0, mbcs_end_t e = 0, mbcs_pos_t p = 0, mbcs_charlen_t c = 0, mbcs_get_unicode_t gu = 0, bool n_ascii_compat = true);
215  DLLLOCAL static const QoreEncoding* findUnlocked(const char* name);
216 
217 public:
    //! adds an alias for an encoding
219  DLLEXPORT static void addAlias(const QoreEncoding* qcs, const char* alias);
220 
    //! finds an encoding if it exists (also looks up against alias names) and creates a new one if it doesn't
222  DLLEXPORT static const QoreEncoding* findCreate(const char* name);
223 
    //! overload of findCreate() taking the encoding name as a QoreString
225  DLLEXPORT static const QoreEncoding* findCreate(const QoreString* str);
226 
    //! prints out all valid encodings to stdout
228  DLLEXPORT static void showEncodings();
229 
    //! prints out all aliases to stdout
231  DLLEXPORT static void showAliases();
232 
    //! adds a new encoding to the list
    // unlike addUnlocked(), this signature has no minimum width, unicode callback or ASCII-compatibility argument
234  DLLEXPORT static const QoreEncoding* add(const char* code, const char* desc = 0, unsigned char maxwidth = 1, mbcs_length_t l = 0, mbcs_end_t e = 0, mbcs_pos_t p = 0, mbcs_charlen_t = 0);
235 
    // library-internal (DLLLOCAL) entry points
    // NOTE(review): "def" is presumably the name of the default encoding - confirm
236  DLLLOCAL static void init(const char* def);
237  DLLLOCAL QoreEncodingManager();
238  DLLLOCAL ~QoreEncodingManager();
239 };
240 
// free functions taking the encoding as an explicit first argument
// q_get_byte_len() has the same remaining parameters as QoreEncoding::getByteLen(p, end, c, xsink)
241 DLLEXPORT qore_size_t q_get_byte_len(const QoreEncoding* enc, const char* p, const char* end, qore_size_t c, ExceptionSink* xsink);
// q_get_char_len() takes an ExceptionSink argument that QoreEncoding::getCharLen(p, valid_len) does not have
// NOTE(review): presumably invalid data is reported through xsink - confirm against the implementation
242 DLLEXPORT qore_offset_t q_get_char_len(const QoreEncoding* enc, const char* p, qore_size_t valid_len, ExceptionSink* xsink);
243 
//! the QoreEncodingManager object
245 DLLEXPORT extern QoreEncodingManager QEM;
246 
247 // builtin character encodings
248 DLLEXPORT extern const QoreEncoding* QCS_DEFAULT, //!< the default encoding for the Qore library
249  *QCS_USASCII, //!< ascii encoding
250  *QCS_UTF8, //!< UTF-8 multi-byte encoding (only UTF-8 and UTF-16 are multi-byte encodings)
251  *QCS_UTF16, //!< UTF-16 (only UTF-8 and UTF-16* are multi-byte encodings)
252  *QCS_UTF16BE, //!< UTF-16BE (only UTF-8 and UTF-16* are multi-byte encodings)
253  *QCS_UTF16LE, //!< UTF-16LE (only UTF-8 and UTF-16* are multi-byte encodings)
254  *QCS_ISO_8859_1, //!< latin-1, Western European encoding
255  *QCS_ISO_8859_2, //!< latin-2, Central European encoding
256  *QCS_ISO_8859_3, //!< latin-3, Southern European character set
257  *QCS_ISO_8859_4, //!< latin-4, Northern European character set
258  *QCS_ISO_8859_5, //!< Cyrillic character set.
259  *QCS_ISO_8859_6, //!< Arabic character set.
260  *QCS_ISO_8859_7, //!< Greek character set.
261  *QCS_ISO_8859_8, //!< Hebrew character set.
262  *QCS_ISO_8859_9, //!< latin-5, Turkish character set
263  *QCS_ISO_8859_10, //!< latin-6, Nordic character set
264  *QCS_ISO_8859_11, //!< Thai character set.
265  *QCS_ISO_8859_13, //!< latin-7, Baltic rim character set
266  *QCS_ISO_8859_14, //!< latin-8, Celtic character set
267  *QCS_ISO_8859_15, //!< latin-9, Western European with euro symbol
268  *QCS_ISO_8859_16, //!< latin-10, Southeast European character set
269  *QCS_KOI8_R, //!< Russian: Kod Obmena Informatsiey, 8 bit.
270  *QCS_KOI8_U, //!< Ukrainian: Kod Obmena Informatsiey, 8 bit.
271  *QCS_KOI7; //!< Russian: Kod Obmena Informatsiey, 7 bit characters.
272 
273 #endif // _QORE_ENCODING_H
QCS_ISO_8859_7
const DLLEXPORT QoreEncoding * QCS_ISO_8859_7
Greek character set.
Definition: QoreEncoding.h:260
QoreEncodingManager::findCreate
static const DLLEXPORT QoreEncoding * findCreate(const char *name)
finds an encoding if it exists (also looks up against alias names) and creates a new one if it doesn't exist
qore_offset_t
intptr_t qore_offset_t
used for offsets that could be negative
Definition: common.h:76
QCS_DEFAULT
const DLLEXPORT QoreEncoding * QCS_DEFAULT
the default encoding for the Qore library
mbcs_charlen_t
qore_offset_t(* mbcs_charlen_t)(const char *str, qore_size_t valid_len)
for multi-byte encodings: gives the number of total bytes for the character given one or more characters
Definition: QoreEncoding.h:63
QoreEncoding::getMaxCharWidth
DLLEXPORT int getMaxCharWidth() const
returns the maximum character width in bytes for the encoding
QoreEncoding::getMinCharWidth
DLLEXPORT unsigned getMinCharWidth() const
returns the minimum character width in bytes for the encoding
QCS_ISO_8859_14
const DLLEXPORT QoreEncoding * QCS_ISO_8859_14
latin-8, Celtic character set
Definition: QoreEncoding.h:266
mbcs_pos_t
qore_size_t(* mbcs_pos_t)(const char *str, const char *ptr, bool &invalid)
for multi-byte character set encodings: gives the character position of the ptr
Definition: QoreEncoding.h:56
QCS_ISO_8859_5
const DLLEXPORT QoreEncoding * QCS_ISO_8859_5
Cyrillic character set.
Definition: QoreEncoding.h:258
QEM
DLLEXPORT QoreEncodingManager QEM
the QoreEncodingManager object
QCS_KOI7
const DLLEXPORT QoreEncoding * QCS_KOI7
Russian: Kod Obmena Informatsiey, 7 bit characters.
Definition: QoreEncoding.h:271
QCS_ISO_8859_15
const DLLEXPORT QoreEncoding * QCS_ISO_8859_15
latin-9, Western European with euro symbol
Definition: QoreEncoding.h:267
qore_size_t
size_t qore_size_t
used for sizes (same range as a pointer)
Definition: common.h:73
QoreEncoding::getByteLen
DLLEXPORT qore_size_t getByteLen(const char *p, const char *end, qore_size_t c, bool &invalid) const
gives the number of bytes for the number of chars in the string or up to the end of the string
QCS_UTF16LE
const DLLEXPORT QoreEncoding * QCS_UTF16LE
UTF-16LE (only UTF-8 and UTF-16* are multi-byte encodings)
Definition: QoreEncoding.h:253
QCS_UTF16
const DLLEXPORT QoreEncoding * QCS_UTF16
UTF-16 (only UTF-8 and UTF-16* are multi-byte encodings)
Definition: QoreEncoding.h:251
QCS_ISO_8859_16
const DLLEXPORT QoreEncoding * QCS_ISO_8859_16
latin-10, Southeast European character set
Definition: QoreEncoding.h:268
QoreEncoding::isMultiByte
DLLEXPORT bool isMultiByte() const
returns true if the encoding is a multi-byte encoding
QoreEncoding::getDesc
const DLLEXPORT char * getDesc() const
returns the description for the encoding
QoreEncodingManager
manages encodings in Qore
Definition: QoreEncoding.h:208
QoreEncodingManager::add
static const DLLEXPORT QoreEncoding * add(const char *code, const char *desc=0, unsigned char maxwidth=1, mbcs_length_t l=0, mbcs_end_t e=0, mbcs_pos_t p=0, mbcs_charlen_t=0)
adds a new encoding to the list
QoreString
Qore's string type supported by the QoreEncoding class.
Definition: QoreString.h:81
QoreEncoding::getUnicode
DLLEXPORT int getUnicode(const char *p, const char *end, unsigned &clen, ExceptionSink *xsink) const
returns the unicode code point for the given character; if there are any errors (invalid character,...
QCS_ISO_8859_4
const DLLEXPORT QoreEncoding * QCS_ISO_8859_4
latin-4, Northern European character set
Definition: QoreEncoding.h:257
QCS_ISO_8859_2
const DLLEXPORT QoreEncoding * QCS_ISO_8859_2
latin-2, Central European encoding
Definition: QoreEncoding.h:255
QCS_USASCII
const DLLEXPORT QoreEncoding * QCS_USASCII
ascii encoding
Definition: QoreEncoding.h:249
QCS_UTF8
const DLLEXPORT QoreEncoding * QCS_UTF8
UTF-8 multi-byte encoding (only UTF-8 and UTF-16 are multi-byte encodings)
Definition: QoreEncoding.h:250
mbcs_length_t
qore_size_t(* mbcs_length_t)(const char *str, const char *end, bool &invalid)
for multi-byte character set encodings: gives the length of the string in characters
Definition: QoreEncoding.h:50
QoreEncoding::getCode
const DLLEXPORT char * getCode() const
returns the string code (ex: "UTF-8") for the encoding
QCS_KOI8_U
const DLLEXPORT QoreEncoding * QCS_KOI8_U
Ukrainian: Kod Obmena Informatsiey, 8 bit.
Definition: QoreEncoding.h:270
QoreEncodingManager::addAlias
static DLLEXPORT void addAlias(const QoreEncoding *qcs, const char *alias)
adds an alias for an encoding
QCS_ISO_8859_10
const DLLEXPORT QoreEncoding * QCS_ISO_8859_10
latin-6, Nordic character set
Definition: QoreEncoding.h:263
QCS_UTF16BE
const DLLEXPORT QoreEncoding * QCS_UTF16BE
UTF-16BE (only UTF-8 and UTF-16* are multi-byte encodings)
Definition: QoreEncoding.h:252
ExceptionSink
container for holding Qore-language exception information and also for registering a "thread_exit" call
Definition: ExceptionSink.h:48
mbcs_end_t
qore_size_t(* mbcs_end_t)(const char *str, const char *end, qore_size_t num_chars, bool &invalid)
for multi-byte character set encodings: gives the number of bytes for the number of chars
Definition: QoreEncoding.h:53
QoreEncodingManager::showEncodings
static DLLEXPORT void showEncodings()
prints out all valid encodings to stdout
QCS_ISO_8859_11
const DLLEXPORT QoreEncoding * QCS_ISO_8859_11
Thai character set.
Definition: QoreEncoding.h:264
common.h
QoreEncoding::getCharPos
DLLEXPORT qore_size_t getCharPos(const char *p, const char *end, bool &invalid) const
gives the character position (number of characters) starting from the first pointer to the second
QoreThreadLock
provides a mutually-exclusive thread lock
Definition: QoreThreadLock.h:49
QCS_ISO_8859_13
const DLLEXPORT QoreEncoding * QCS_ISO_8859_13
latin-7, Baltic rim character set
Definition: QoreEncoding.h:265
QoreEncoding::getCharLen
DLLEXPORT qore_offset_t getCharLen(const char *p, qore_size_t valid_len) const
gives the number of total bytes for the next character at the given pointer
mbcs_get_unicode_t
unsigned(* mbcs_get_unicode_t)(const char *p)
returns the unicode code point for the given character, assumes there is enough data for the character
Definition: QoreEncoding.h:66
QoreEncodingManager::showAliases
static DLLEXPORT void showAliases()
prints out all aliases to stdout
QCS_KOI8_R
const DLLEXPORT QoreEncoding * QCS_KOI8_R
Russian: Kod Obmena Informatsiey, 8 bit.
Definition: QoreEncoding.h:269
QCS_ISO_8859_3
const DLLEXPORT QoreEncoding * QCS_ISO_8859_3
latin-3, Southern European character set
Definition: QoreEncoding.h:256
QCS_ISO_8859_9
const DLLEXPORT QoreEncoding * QCS_ISO_8859_9
latin-5, Turkish character set
Definition: QoreEncoding.h:262
QCS_ISO_8859_8
const DLLEXPORT QoreEncoding * QCS_ISO_8859_8
Hebrew character set.
Definition: QoreEncoding.h:261
QoreEncoding
defines string encoding functions in Qore
Definition: QoreEncoding.h:83
QoreEncoding::isAsciiCompat
DLLEXPORT bool isAsciiCompat() const
returns true if the character encoding is backwards-compatible with ASCII
QCS_ISO_8859_1
const DLLEXPORT QoreEncoding * QCS_ISO_8859_1
latin-1, Western European encoding
Definition: QoreEncoding.h:254
QCS_ISO_8859_6
const DLLEXPORT QoreEncoding * QCS_ISO_8859_6
Arabic character set.
Definition: QoreEncoding.h:259
QoreEncoding::getLength
DLLEXPORT qore_size_t getLength(const char *p, const char *end, bool &invalid) const
gives the length of the string in characters