32#ifndef _QORE_ENCODING_H
34#define _QORE_ENCODING_H
42#include <qore/QoreThreadLock.h>
// Function-pointer type for multi-byte character set encodings: gives the
// length of the string in characters.
//   str     - pointer into the string data (NOTE(review): presumably the first byte - confirm)
//   end     - end of the string data (NOTE(review): presumably one past the last byte - confirm)
//   invalid - bool passed by reference (NOTE(review): presumably set when invalid
//             character data is encountered - confirm against the implementations)
// Returns the character count as a size_t.
50typedef size_t (*
mbcs_length_t)(
const char* str,
const char* end,
bool &invalid);
// Function-pointer type for multi-byte character set encodings: gives the
// number of bytes for the number of chars.
//   str       - pointer into the string data (NOTE(review): presumably the first byte - confirm)
//   end       - end of the string data (NOTE(review): presumably one past the last byte - confirm)
//   num_chars - the number of characters to measure
//   invalid   - bool passed by reference (NOTE(review): presumably set when invalid
//               character data is encountered - confirm against the implementations)
// Returns the byte count as a size_t.
53typedef size_t (*
mbcs_end_t)(
const char* str,
const char* end,
size_t num_chars,
bool &invalid);
// Function-pointer type for multi-byte character set encodings: gives the
// character position of ptr.
//   str     - pointer into the string data (NOTE(review): presumably the start of the
//             string that ptr points into - confirm)
//   ptr     - the pointer whose character position is wanted
//   invalid - bool passed by reference (NOTE(review): presumably set when invalid
//             character data is encountered - confirm against the implementations)
// Returns the character position as a size_t.
56typedef size_t (*
mbcs_pos_t)(
const char* str,
const char* ptr,
bool &invalid);
69hashdecl qore_encoding_private;
// qore_encoding_private is granted friend access to this class.
84 friend hashdecl qore_encoding_private;
// Pointer to the private data object; qore_encoding_private is only
// forward-declared in this header, so its layout is not visible here.
87 qore_encoding_private* priv;
90 DLLLOCAL
QoreEncoding(
const char* code,
const char* desc =
nullptr,
unsigned char minwidth = 1,
// Gives the length of the string in characters.
//   p       - pointer into the string data (NOTE(review): presumably the first byte - confirm)
//   end     - end of the string data (NOTE(review): presumably one past the last byte - confirm)
//   invalid - bool passed by reference (NOTE(review): presumably set when invalid
//             character data is encountered - confirm against the implementation)
// An overload taking an ExceptionSink* instead of the bool reference also exists.
103 DLLEXPORT
size_t getLength(
const char* p,
const char* end,
bool& invalid)
const;
// Gives the number of bytes for the number of chars in the string or up to
// the end of the string.
//   p       - pointer into the string data (NOTE(review): presumably the first byte - confirm)
//   end     - end of the string data (NOTE(review): presumably one past the last byte - confirm)
//   c       - the number of characters to measure
//   invalid - bool passed by reference (NOTE(review): presumably set when invalid
//             character data is encountered - confirm against the implementation)
// An overload taking an ExceptionSink* instead of the bool reference also exists.
122 DLLEXPORT
size_t getByteLen(
const char* p,
const char* end,
size_t c,
bool& invalid)
const;
// Gives the character position (number of characters) starting from the
// first pointer to the second.
//   p       - the first pointer (where counting starts)
//   end     - the second pointer (where counting stops)
//   invalid - bool passed by reference (NOTE(review): presumably set when invalid
//             character data is encountered - confirm against the implementation)
// An overload taking an ExceptionSink* instead of the bool reference also exists.
141 DLLEXPORT
size_t getCharPos(
const char* p,
const char* end,
bool& invalid)
const;
// Map from a C-string key to a mutable QoreEncoding pointer, ordered by the
// ltcstrcase comparator (NOTE(review): presumably a case-insensitive C-string
// comparison - confirm where ltcstrcase is defined). The keys are raw
// const char* pointers, so the map does not own the key storage.
197typedef std::map<const char*, QoreEncoding*, ltcstrcase> encoding_map_t;
// Same as encoding_map_t, but the mapped values are pointers to const QoreEncoding.
198typedef std::map<const char*, const QoreEncoding*, ltcstrcase> const_encoding_map_t;
// Static map of type encoding_map_t, not exported from the library (DLLLOCAL).
// NOTE(review): presumably holds the registered encodings keyed by code - confirm.
207 DLLLOCAL
static encoding_map_t emap;
// Static map of type const_encoding_map_t, not exported from the library (DLLLOCAL).
// NOTE(review): presumably holds alias names mapped to encodings - confirm.
208 DLLLOCAL
static const_encoding_map_t amap;
// Library-internal (DLLLOCAL) static lookup returning a const QoreEncoding*
// for the given name.
// NOTE(review): the name suggests the caller must already hold whatever lock
// protects the encoding maps, and that a null pointer means "not found" -
// confirm both against the implementation.
212 DLLLOCAL
static const QoreEncoding* findUnlocked(
const char* name);
// Library-internal (DLLLOCAL) static initialization entry point.
// NOTE(review): `def` is presumably the name of the default encoding to use
// (cf. QCS_DEFAULT) - confirm against the implementation.
233 DLLLOCAL
static void init(
const char* def);
DLLEXPORT const QoreEncoding * QCS_UTF16
UTF-16 (only UTF-* are multi-byte encodings)
Definition: QoreEncoding.h:248
DLLEXPORT const QoreEncoding * QCS_KOI7
Russian: Kod Obmena Informatsiey, 7 bit characters.
Definition: QoreEncoding.h:268
size_t(* mbcs_pos_t)(const char *str, const char *ptr, bool &invalid)
for multi-byte character set encodings: gives the character position of the ptr
Definition: QoreEncoding.h:56
DLLEXPORT const QoreEncoding * QCS_KOI8_R
Russian: Kod Obmena Informatsiey, 8 bit.
Definition: QoreEncoding.h:266
DLLEXPORT const QoreEncoding * QCS_KOI8_U
Ukrainian: Kod Obmena Informatsiey, 8 bit.
Definition: QoreEncoding.h:267
DLLEXPORT const QoreEncoding * QCS_ISO_8859_4
latin-4, Northern European character set
Definition: QoreEncoding.h:254
DLLEXPORT const QoreEncoding * QCS_ISO_8859_11
Thai character set.
Definition: QoreEncoding.h:261
DLLEXPORT const QoreEncoding * QCS_WINDOWS_1258
Windows 1258: Vietnamese.
Definition: QoreEncoding.h:279
DLLEXPORT const QoreEncoding * QCS_WINDOWS_1251
Windows 1251: Cyrillic: Russian, Ukrainian, Belarusian, Bulgarian, Serbian Cyrillic,...
Definition: QoreEncoding.h:272
DLLEXPORT const QoreEncoding * QCS_DEFAULT
the default encoding for the Qore library
DLLEXPORT const QoreEncoding * QCS_ISO_8859_15
latin-9, Western European with euro symbol
Definition: QoreEncoding.h:264
DLLEXPORT const QoreEncoding * QCS_UTF16LE
UTF-16LE (only UTF-* are multi-byte encodings)
Definition: QoreEncoding.h:250
DLLEXPORT const QoreEncoding * QCS_ISO_8859_6
Arabic character set.
Definition: QoreEncoding.h:256
DLLEXPORT const QoreEncoding * QCS_ISO_8859_8
Hebrew character set.
Definition: QoreEncoding.h:258
DLLEXPORT const QoreEncoding * QCS_UTF16BE
UTF-16BE (only UTF-* are multi-byte encodings)
Definition: QoreEncoding.h:249
DLLEXPORT const QoreEncoding * QCS_ISO_8859_9
latin-5, Turkish character set
Definition: QoreEncoding.h:259
DLLEXPORT const QoreEncoding * QCS_ISO_8859_7
Greek character set.
Definition: QoreEncoding.h:257
DLLEXPORT const QoreEncoding * QCS_ISO_8859_16
latin-10, Southeast European character set
Definition: QoreEncoding.h:265
DLLEXPORT const QoreEncoding * QCS_WINDOWS_1252
Windows 1252: European: Spanish, French, German.
Definition: QoreEncoding.h:273
DLLEXPORT const QoreEncoding * QCS_WINDOWS_936
Windows 936: Simplified Chinese.
Definition: QoreEncoding.h:270
qore_offset_t(* mbcs_charlen_t)(const char *str, size_t valid_len)
for multi-byte encodings: gives the number of total bytes for the character given one or more charact...
Definition: QoreEncoding.h:63
DLLEXPORT const QoreEncoding * QCS_UTF8
UTF-8 multi-byte encoding (only UTF-8 and UTF-16 are multi-byte encodings)
Definition: QoreEncoding.h:247
DLLEXPORT const QoreEncoding * QCS_ISO_8859_2
latin-2, Central European encoding
Definition: QoreEncoding.h:252
DLLEXPORT const QoreEncoding * QCS_ISO_8859_10
latin-6, Nordic character set
Definition: QoreEncoding.h:260
DLLEXPORT const QoreEncoding * QCS_ISO_8859_1
latin-1, Western European encoding
Definition: QoreEncoding.h:251
DLLEXPORT const QoreEncoding * QCS_WINDOWS_1255
Windows 1255: Hebrew.
Definition: QoreEncoding.h:276
DLLEXPORT const QoreEncoding * QCS_WINDOWS_1253
Windows 1253: Greek.
Definition: QoreEncoding.h:274
DLLEXPORT const QoreEncoding * QCS_ISO_8859_5
Cyrillic character set.
Definition: QoreEncoding.h:255
size_t(* mbcs_end_t)(const char *str, const char *end, size_t num_chars, bool &invalid)
for multi-byte character set encodings: gives the number of bytes for the number of chars
Definition: QoreEncoding.h:53
DLLEXPORT QoreEncodingManager QEM
the QoreEncodingManager object
DLLEXPORT const QoreEncoding * QCS_ISO_8859_3
latin-3, Southern European character set
Definition: QoreEncoding.h:253
DLLEXPORT const QoreEncoding * QCS_WINDOWS_1254
Windows 1254: Turkish.
Definition: QoreEncoding.h:275
DLLEXPORT const QoreEncoding * QCS_WINDOWS_1257
Windows 1257: Baltic.
Definition: QoreEncoding.h:278
size_t(* mbcs_length_t)(const char *str, const char *end, bool &invalid)
for multi-byte character set encodings: gives the length of the string in characters
Definition: QoreEncoding.h:50
DLLEXPORT const QoreEncoding * QCS_USASCII
ASCII encoding
Definition: QoreEncoding.h:246
unsigned(* mbcs_get_unicode_t)(const char *p)
returns the unicode code point for the given character, assumes there is enough data for the characte...
Definition: QoreEncoding.h:66
DLLEXPORT const QoreEncoding * QCS_WINDOWS_874
Windows 874: Latin/Thai - similar to ISO-8859-11.
Definition: QoreEncoding.h:269
DLLEXPORT const QoreEncoding * QCS_WINDOWS_1250
Windows 1250: Central/Eastern European.
Definition: QoreEncoding.h:271
DLLEXPORT const QoreEncoding * QCS_ISO_8859_14
latin-8, Celtic character set
Definition: QoreEncoding.h:263
DLLEXPORT const QoreEncoding * QCS_WINDOWS_1256
Windows 1256: Arabic.
Definition: QoreEncoding.h:277
DLLEXPORT const QoreEncoding * QCS_ISO_8859_13
latin-7, Baltic rim character set
Definition: QoreEncoding.h:262
container for holding Qore-language exception information and also for registering a "thread_exit" ca...
Definition: ExceptionSink.h:50
defines string encoding functions in Qore
Definition: QoreEncoding.h:83
DLLEXPORT bool isMultiByte() const
returns true if the encoding is a multi-byte encoding
DLLEXPORT int getUnicode(const char *p, const char *end, unsigned &clen, ExceptionSink *xsink) const
returns the unicode code point for the given character; if there are any errors (invalid character,...
DLLEXPORT size_t getLength(const char *p, const char *end, ExceptionSink *xsink) const
gives the length of the string in characters
DLLEXPORT int getMaxCharWidth() const
returns the maximum character width in bytes for the encoding
DLLEXPORT const char * getDesc() const
returns the description for the encoding
DLLEXPORT unsigned getMinCharWidth() const
returns the minimum character width in bytes for the encoding
DLLEXPORT size_t getCharPos(const char *p, const char *end, bool &invalid) const
gives the character position (number of characters) starting from the first pointer to the second
DLLEXPORT size_t getByteLen(const char *p, const char *end, size_t c, bool &invalid) const
gives the number of bytes for the number of chars in the string or up to the end of the string
DLLEXPORT size_t getCharPos(const char *p, const char *end, ExceptionSink *xsink) const
gives the character position (number of characters) starting from the first pointer to the second
DLLEXPORT bool isAsciiCompat() const
returns true if the character encoding is backwards-compatible with ASCII
DLLEXPORT const char * getCode() const
returns the string code (ex: "UTF-8") for the encoding
DLLEXPORT size_t getByteLen(const char *p, const char *end, size_t c, ExceptionSink *xsink) const
gives the number of bytes for the number of chars in the string or up to the end of the string
DLLEXPORT qore_offset_t getCharLen(const char *p, size_t valid_len) const
gives the number of total bytes for the next character at the given pointer
DLLEXPORT size_t getLength(const char *p, const char *end, bool &invalid) const
gives the length of the string in characters
manages encodings in Qore
Definition: QoreEncoding.h:205
static DLLEXPORT void showAliases()
prints out all aliases to stdout
static DLLEXPORT const QoreEncoding * findCreate(const char *name)
finds an encoding if it exists (also looks up against alias names) and creates a new one if it doesn'...
static DLLEXPORT const QoreEncoding * add(const char *code, const char *desc=0, unsigned char maxwidth=1, mbcs_length_t l=0, mbcs_end_t e=0, mbcs_pos_t p=0, mbcs_charlen_t=0)
adds a new encoding to the list
static DLLEXPORT void showEncodings()
prints out all valid encodings to stdout
static DLLEXPORT const QoreEncoding * findCreate(const QoreString *str)
finds an encoding if it exists (also looks up against alias names) and creates a new one if it doesn'...
static DLLEXPORT void addAlias(const QoreEncoding *qcs, const char *alias)
adds an alias for an encoding
Qore's string type supported by the QoreEncoding class.
Definition: QoreString.h:93
provides a mutually-exclusive thread lock
Definition: QoreThreadLock.h:49
intptr_t qore_offset_t
used for offsets that could be negative
Definition: common.h:76