Qore Programming Language 1.14.0
Loading...
Searching...
No Matches
QoreEncoding.h
Go to the documentation of this file.
1/* -*- mode: c++; indent-tabs-mode: nil -*- */
2/*
3 QoreEncoding.h
4
5 Qore Programming Language
6
7 Copyright (C) 2003 - 2023 Qore Technologies, s.r.o.
8
9 Permission is hereby granted, free of charge, to any person obtaining a
10 copy of this software and associated documentation files (the "Software"),
11 to deal in the Software without restriction, including without limitation
12 the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 and/or sell copies of the Software, and to permit persons to whom the
14 Software is furnished to do so, subject to the following conditions:
15
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 DEALINGS IN THE SOFTWARE.
26
27 Note that the Qore library is released under a choice of three open-source
28 licenses: MIT (as above), LGPL 2+, or GPL 2+; see README-LICENSE for more
29 information.
30*/
31
32#ifndef _QORE_ENCODING_H
33
34#define _QORE_ENCODING_H
35
41#include <qore/common.h>
42#include <qore/QoreThreadLock.h>
43
44#include <cstring>
45#include <map>
46#include <string>
47#include <strings.h>
48
// for multi-byte character set encodings: gives the length of the string in characters
50typedef size_t (*mbcs_length_t)(const char* str, const char* end, bool &invalid);
51
// for multi-byte character set encodings: gives the number of bytes for the number of chars
53typedef size_t (*mbcs_end_t)(const char* str, const char* end, size_t num_chars, bool &invalid);
54
// for multi-byte character set encodings: gives the character position of the ptr
56typedef size_t (*mbcs_pos_t)(const char* str, const char* ptr, bool &invalid);
57
59
// for multi-byte encodings: gives the number of total bytes for the character
// NOTE(review): the generated description is truncated; the return type is qore_offset_t
// (an offset type that may be negative) - confirm what a negative result means
63typedef qore_offset_t (*mbcs_charlen_t)(const char* str, size_t valid_len);
64
// returns the unicode code point for the given character; assumes there is enough data
// for the character (NOTE(review): the generated description is truncated - confirm the full contract)
66typedef unsigned (*mbcs_get_unicode_t)(const char* p);
67
68// private implementation of the QoreEncoding class
69hashdecl qore_encoding_private;
70
72
84 friend hashdecl qore_encoding_private;
85
86protected:
87 qore_encoding_private* priv;
88
89public:
// library-local constructor taking the encoding code, an optional description, minimum and
// maximum character widths, optional mbcs_* callback functions and an ASCII-compatibility flag
// NOTE(review): parameter roles are inferred from their names and types - confirm against the implementation
90 DLLLOCAL QoreEncoding(const char* code, const char* desc = nullptr, unsigned char minwidth = 1,
91 unsigned char maxwidth = 1, mbcs_length_t l = nullptr, mbcs_end_t e = nullptr, mbcs_pos_t p = nullptr,
92 mbcs_charlen_t c = nullptr, mbcs_get_unicode_t gu = nullptr, bool ascii_compat = true);
93
// library-local destructor (presumably releases the private implementation "priv" - confirm)
94 DLLLOCAL ~QoreEncoding();
95
97
// gives the length of the string in characters; this variant reports problems through the "invalid" flag
103 DLLEXPORT size_t getLength(const char* p, const char* end, bool& invalid) const;
104
106
// gives the length of the string in characters; this variant takes an ExceptionSink for error reporting
112 DLLEXPORT size_t getLength(const char* p, const char* end, ExceptionSink* xsink) const;
113
115
// gives the number of bytes for the number of chars in the string or up to the end of the string;
// this variant reports problems through the "invalid" flag
122 DLLEXPORT size_t getByteLen(const char* p, const char* end, size_t c, bool& invalid) const;
123
125
// gives the number of bytes for the number of chars in the string or up to the end of the string;
// this variant takes an ExceptionSink for error reporting
132 DLLEXPORT size_t getByteLen(const char* p, const char* end, size_t c, ExceptionSink* xsink) const;
133
135
// gives the character position (number of characters) starting from the first pointer to the second;
// this variant reports problems through the "invalid" flag
141 DLLEXPORT size_t getCharPos(const char* p, const char* end, bool& invalid) const;
142
144
// gives the character position (number of characters) starting from the first pointer to the second;
// this variant takes an ExceptionSink for error reporting
150 DLLEXPORT size_t getCharPos(const char* p, const char* end, ExceptionSink* xsink) const;
151
153
// gives the number of total bytes for the next character at the given pointer
// NOTE(review): returns qore_offset_t, which may be negative - confirm the meaning of a negative result
159 DLLEXPORT qore_offset_t getCharLen(const char* p, size_t valid_len) const;
160
// returns true if the encoding is a multi-byte encoding
162 DLLEXPORT bool isMultiByte() const;
163
// returns the string code (ex: "UTF-8") for the encoding
165 DLLEXPORT const char* getCode() const;
166
// returns the description for the encoding
168 DLLEXPORT const char* getDesc() const;
169
// returns the maximum character width in bytes for the encoding
171 DLLEXPORT int getMaxCharWidth() const;
172
174
// returns the minimum character width in bytes for the encoding
176 DLLEXPORT unsigned getMinCharWidth() const;
177
179
// returns true if the character encoding is backwards-compatible with ASCII
181 DLLEXPORT bool isAsciiCompat() const;
182
184
// returns the unicode code point for the given character
// NOTE(review): the generated description of the error path (invalid character, ...) is truncated;
// confirm the return value and the use of "clen" and "xsink" against the implementation
193 DLLEXPORT int getUnicode(const char* p, const char* end, unsigned& clen, ExceptionSink* xsink) const;
194};
195
196// case-insensitive maps for encodings
197typedef std::map<const char*, QoreEncoding*, ltcstrcase> encoding_map_t;
198typedef std::map<const char*, const QoreEncoding*, ltcstrcase> const_encoding_map_t;
199
200class QoreString;
201
203
206private:
// static registry state: two case-insensitive name maps and a QoreThreadLock
// NOTE(review): presumably emap holds the encodings, amap the aliases, and mutex guards both - confirm
207 DLLLOCAL static encoding_map_t emap;
208 DLLLOCAL static const_encoding_map_t amap;
209 DLLLOCAL static QoreThreadLock mutex;
210
// private helper with the same parameter list as the QoreEncoding constructor
// NOTE(review): the "Unlocked" suffix suggests the caller must already hold the mutex - confirm
211 DLLLOCAL static const QoreEncoding* addUnlocked(const char* n_code, const char* n_desc = 0, unsigned char n_minwidth = 1, unsigned char n_maxwidth = 1, mbcs_length_t l = 0, mbcs_end_t e = 0, mbcs_pos_t p = 0, mbcs_charlen_t c = 0, mbcs_get_unicode_t gu = 0, bool n_ascii_compat = true);
// private lookup helper taking an encoding name
// NOTE(review): the "Unlocked" suffix suggests the caller must already hold the mutex - confirm
212 DLLLOCAL static const QoreEncoding* findUnlocked(const char* name);
213
214public:
// adds an alias for an encoding
216 DLLEXPORT static void addAlias(const QoreEncoding* qcs, const char* alias);
217
// finds an encoding if it exists (also looks up against alias names) and creates a new one if it doesn't
// (NOTE(review): the generated description is truncated after "doesn'" - confirm)
219 DLLEXPORT static const QoreEncoding* findCreate(const char* name);
220
// finds an encoding if it exists (also looks up against alias names) and creates a new one if it doesn't;
// this variant takes the name as a QoreString (NOTE(review): generated description is truncated - confirm)
222 DLLEXPORT static const QoreEncoding* findCreate(const QoreString* str);
223
// prints out all valid encodings to stdout
225 DLLEXPORT static void showEncodings();
226
// prints out all aliases to stdout
228 DLLEXPORT static void showAliases();
229
// adds a new encoding to the list
231 DLLEXPORT static const QoreEncoding* add(const char* code, const char* desc = 0, unsigned char maxwidth = 1, mbcs_length_t l = 0, mbcs_end_t e = 0, mbcs_pos_t p = 0, mbcs_charlen_t = 0);
232
// library-local initialization (NOTE(review): "def" presumably names the default encoding - confirm)
233 DLLLOCAL static void init(const char* def);
// library-local constructor
234 DLLLOCAL QoreEncodingManager();
// library-local destructor
235 DLLLOCAL ~QoreEncodingManager();
236};
237
// free-function form taking the encoding as its first argument
// NOTE(review): presumably equivalent to enc->getByteLen(p, end, c, xsink) - confirm
238DLLEXPORT size_t q_get_byte_len(const QoreEncoding* enc, const char* p, const char* end, size_t c, ExceptionSink* xsink);
// free-function form taking the encoding as its first argument, plus an ExceptionSink
// NOTE(review): presumably wraps enc->getCharLen(p, valid_len) and reports errors via xsink - confirm
239DLLEXPORT qore_offset_t q_get_char_len(const QoreEncoding* enc, const char* p, size_t valid_len, ExceptionSink* xsink);
240
// the QoreEncodingManager object
242DLLEXPORT extern QoreEncodingManager QEM;
243
244// builtin character encodings
245DLLEXPORT extern const QoreEncoding* QCS_DEFAULT,
280
281#endif // _QORE_ENCODING_H
DLLEXPORT const QoreEncoding * QCS_UTF16
UTF-16 (only UTF-* are multi-byte encodings)
Definition: QoreEncoding.h:248
DLLEXPORT const QoreEncoding * QCS_KOI7
Russian: Kod Obmena Informatsiey, 7 bit characters.
Definition: QoreEncoding.h:268
size_t(* mbcs_pos_t)(const char *str, const char *ptr, bool &invalid)
for multi-byte character set encodings: gives the character position of the ptr
Definition: QoreEncoding.h:56
DLLEXPORT const QoreEncoding * QCS_KOI8_R
Russian: Kod Obmena Informatsiey, 8 bit.
Definition: QoreEncoding.h:266
DLLEXPORT const QoreEncoding * QCS_KOI8_U
Ukrainian: Kod Obmena Informatsiey, 8 bit.
Definition: QoreEncoding.h:267
DLLEXPORT const QoreEncoding * QCS_ISO_8859_4
latin-4, Northern European character set
Definition: QoreEncoding.h:254
DLLEXPORT const QoreEncoding * QCS_ISO_8859_11
Thai character set.
Definition: QoreEncoding.h:261
DLLEXPORT const QoreEncoding * QCS_WINDOWS_1258
Windows 1258: Vietnamese.
Definition: QoreEncoding.h:279
DLLEXPORT const QoreEncoding * QCS_WINDOWS_1251
Windows 1251: Cyrillic: Russian, Ukrainian, Belarusian, Bulgarian, Serbian Cyrillic,...
Definition: QoreEncoding.h:272
DLLEXPORT const QoreEncoding * QCS_DEFAULT
the default encoding for the Qore library
DLLEXPORT const QoreEncoding * QCS_ISO_8859_15
latin-9, Western European with euro symbol
Definition: QoreEncoding.h:264
DLLEXPORT const QoreEncoding * QCS_UTF16LE
UTF-16LE (only UTF-* are multi-byte encodings)
Definition: QoreEncoding.h:250
DLLEXPORT const QoreEncoding * QCS_ISO_8859_6
Arabic character set.
Definition: QoreEncoding.h:256
DLLEXPORT const QoreEncoding * QCS_ISO_8859_8
Hebrew character set.
Definition: QoreEncoding.h:258
DLLEXPORT const QoreEncoding * QCS_UTF16BE
UTF-16BE (only UTF-* are multi-byte encodings)
Definition: QoreEncoding.h:249
DLLEXPORT const QoreEncoding * QCS_ISO_8859_9
latin-5, Turkish character set
Definition: QoreEncoding.h:259
DLLEXPORT const QoreEncoding * QCS_ISO_8859_7
Greek character set.
Definition: QoreEncoding.h:257
DLLEXPORT const QoreEncoding * QCS_ISO_8859_16
latin-10, Southeast European character set
Definition: QoreEncoding.h:265
DLLEXPORT const QoreEncoding * QCS_WINDOWS_1252
Windows 1252: European: Spanish, French, German.
Definition: QoreEncoding.h:273
DLLEXPORT const QoreEncoding * QCS_WINDOWS_936
Windows 936: Simplified Chinese.
Definition: QoreEncoding.h:270
qore_offset_t(* mbcs_charlen_t)(const char *str, size_t valid_len)
for multi-byte encodings: gives the number of total bytes for the character given one or more charact...
Definition: QoreEncoding.h:63
DLLEXPORT const QoreEncoding * QCS_UTF8
UTF-8 multi-byte encoding (only UTF-8 and UTF-16 are multi-byte encodings)
Definition: QoreEncoding.h:247
DLLEXPORT const QoreEncoding * QCS_ISO_8859_2
latin-2, Central European encoding
Definition: QoreEncoding.h:252
DLLEXPORT const QoreEncoding * QCS_ISO_8859_10
latin-6, Nordic character set
Definition: QoreEncoding.h:260
DLLEXPORT const QoreEncoding * QCS_ISO_8859_1
latin-1, Western European encoding
Definition: QoreEncoding.h:251
DLLEXPORT const QoreEncoding * QCS_WINDOWS_1255
Windows 1255: Hebrew.
Definition: QoreEncoding.h:276
DLLEXPORT const QoreEncoding * QCS_WINDOWS_1253
Windows 1253: Greek.
Definition: QoreEncoding.h:274
DLLEXPORT const QoreEncoding * QCS_ISO_8859_5
Cyrillic character set.
Definition: QoreEncoding.h:255
size_t(* mbcs_end_t)(const char *str, const char *end, size_t num_chars, bool &invalid)
for multi-byte character set encodings: gives the number of bytes for the number of chars
Definition: QoreEncoding.h:53
DLLEXPORT QoreEncodingManager QEM
the QoreEncodingManager object
DLLEXPORT const QoreEncoding * QCS_ISO_8859_3
latin-3, Southern European character set
Definition: QoreEncoding.h:253
DLLEXPORT const QoreEncoding * QCS_WINDOWS_1254
Windows 1254: Turkish.
Definition: QoreEncoding.h:275
DLLEXPORT const QoreEncoding * QCS_WINDOWS_1257
Windows 1257: Baltic.
Definition: QoreEncoding.h:278
size_t(* mbcs_length_t)(const char *str, const char *end, bool &invalid)
for multi-byte character set encodings: gives the length of the string in characters
Definition: QoreEncoding.h:50
DLLEXPORT const QoreEncoding * QCS_USASCII
ascii encoding
Definition: QoreEncoding.h:246
unsigned(* mbcs_get_unicode_t)(const char *p)
returns the unicode code point for the given character, assumes there is enough data for the characte...
Definition: QoreEncoding.h:66
DLLEXPORT const QoreEncoding * QCS_WINDOWS_874
Windows 874: Latin/Thai - similar to ISO-8859-11.
Definition: QoreEncoding.h:269
DLLEXPORT const QoreEncoding * QCS_WINDOWS_1250
Windows 1250: Central/Eastern European.
Definition: QoreEncoding.h:271
DLLEXPORT const QoreEncoding * QCS_ISO_8859_14
latin-8, Celtic character set
Definition: QoreEncoding.h:263
DLLEXPORT const QoreEncoding * QCS_WINDOWS_1256
Windows 1256: Arabic.
Definition: QoreEncoding.h:277
DLLEXPORT const QoreEncoding * QCS_ISO_8859_13
latin-7, Baltic rim character set
Definition: QoreEncoding.h:262
container for holding Qore-language exception information and also for registering a "thread_exit" ca...
Definition: ExceptionSink.h:50
defines string encoding functions in Qore
Definition: QoreEncoding.h:83
DLLEXPORT bool isMultiByte() const
returns true if the encoding is a multi-byte encoding
DLLEXPORT int getUnicode(const char *p, const char *end, unsigned &clen, ExceptionSink *xsink) const
returns the unicode code point for the given character; if there are any errors (invalid character,...
DLLEXPORT size_t getLength(const char *p, const char *end, ExceptionSink *xsink) const
gives the length of the string in characters
DLLEXPORT int getMaxCharWidth() const
returns the maximum character width in bytes for the encoding
DLLEXPORT const char * getDesc() const
returns the description for the encoding
DLLEXPORT unsigned getMinCharWidth() const
returns the minimum character width in bytes for the encoding
DLLEXPORT size_t getCharPos(const char *p, const char *end, bool &invalid) const
gives the character position (number of characters) starting from the first pointer to the second
DLLEXPORT size_t getByteLen(const char *p, const char *end, size_t c, bool &invalid) const
gives the number of bytes for the number of chars in the string or up to the end of the string
DLLEXPORT size_t getCharPos(const char *p, const char *end, ExceptionSink *xsink) const
gives the character position (number of characters) starting from the first pointer to the second
DLLEXPORT bool isAsciiCompat() const
returns true if the character encoding is backwards-compatible with ASCII
DLLEXPORT const char * getCode() const
returns the string code (ex: "UTF-8") for the encoding
DLLEXPORT size_t getByteLen(const char *p, const char *end, size_t c, ExceptionSink *xsink) const
gives the number of bytes for the number of chars in the string or up to the end of the string
DLLEXPORT qore_offset_t getCharLen(const char *p, size_t valid_len) const
gives the number of total bytes for the next character at the given pointer
DLLEXPORT size_t getLength(const char *p, const char *end, bool &invalid) const
gives the length of the string in characters
manages encodings in Qore
Definition: QoreEncoding.h:205
static DLLEXPORT void showAliases()
prints out all aliases to stdout
static DLLEXPORT const QoreEncoding * findCreate(const char *name)
finds an encoding if it exists (also looks up against alias names) and creates a new one if it doesn'...
static DLLEXPORT const QoreEncoding * add(const char *code, const char *desc=0, unsigned char maxwidth=1, mbcs_length_t l=0, mbcs_end_t e=0, mbcs_pos_t p=0, mbcs_charlen_t=0)
adds a new encoding to the list
static DLLEXPORT void showEncodings()
prints out all valid encodings to stdout
static DLLEXPORT const QoreEncoding * findCreate(const QoreString *str)
finds an encoding if it exists (also looks up against alias names) and creates a new one if it doesn'...
static DLLEXPORT void addAlias(const QoreEncoding *qcs, const char *alias)
adds an alias for an encoding
Qore's string type supported by the QoreEncoding class.
Definition: QoreString.h:93
provides a mutually-exclusive thread lock
Definition: QoreThreadLock.h:49
intptr_t qore_offset_t
used for offsets that could be negative
Definition: common.h:76