Abstract Unicode Text API.

Required Header

#include <utils_i18n.h>

Overview

Unicode Script Information.

Functions
int	i18n_uscript_get_codes (const char *language, i18n_uscript_code_e *codes, int32_t capacity, int32_t *length)
	Gets the script codes associated with the specified language.
const char *	i18n_uscript_get_name (i18n_uscript_code_e script_code)
	Gets the script name for the specified script code.
const char *	i18n_uscript_get_short_name (i18n_uscript_code_e script_code)
	Gets the short script name for the specified script code.
int	i18n_uscript_get_script (i18n_uchar32 codepoint, i18n_uscript_code_e *script_code)
	Gets the script code associated with the given codepoint.
i18n_ubool	i18n_uscript_has_script (i18n_uchar32 codepoint, i18n_uscript_code_e script_code)
	Gets a value indicating whether the Script Extensions of the specified codepoint contain the specified script.
int	i18n_uscript_get_script_extensions (i18n_uchar32 codepoint, i18n_uscript_code_e *scripts, int32_t capacity, int32_t *length)
	Gets the Script Extensions for the specified codepoint.
int	i18n_uscript_get_sample_string (i18n_uscript_code_e script, i18n_uchar *sample, int32_t capacity, int32_t *length)
	Gets the script sample character string.
i18n_uscript_usage_e	i18n_uscript_get_usage (i18n_uscript_code_e script)
	Returns the script usage according to UAX #31 Unicode Identifier and Pattern Syntax.
i18n_ubool	i18n_uscript_is_right_to_left (i18n_uscript_code_e script)
	Gets a value indicating whether the script is written right-to-left.
i18n_ubool	i18n_uscript_breaks_between_letters (i18n_uscript_code_e script)
	Gets a value indicating whether the script allows line breaks between letters (excluding hyphenation).
i18n_ubool	i18n_uscript_is_cased (i18n_uscript_code_e script)
	Gets a value indicating whether case distinctions are customary in modern usage of the script.

Enumeration Type Documentation

enum i18n_uscript_code_e

Constants for ISO 15924 script codes.

Since :: 2.4

Enumerator:

I18N_USCRIPT_COMMON	Zyyy
I18N_USCRIPT_INHERITED	Zinh, "Code for inherited script", for non-spacing combining marks; also Qaai
I18N_USCRIPT_ARABIC	Arab
I18N_USCRIPT_ARMENIAN	Armn
I18N_USCRIPT_BENGALI	Beng
I18N_USCRIPT_BOPOMOFO	Bopo
I18N_USCRIPT_CHEROKEE	Cher
I18N_USCRIPT_COPTIC	Copt
I18N_USCRIPT_CYRILLIC	Cyrl
I18N_USCRIPT_DESERET	Dsrt
I18N_USCRIPT_DEVANAGARI	Deva
I18N_USCRIPT_ETHIOPIC	Ethi
I18N_USCRIPT_GEORGIAN	Geor
I18N_USCRIPT_GOTHIC	Goth
I18N_USCRIPT_GREEK	Grek
I18N_USCRIPT_GUJARATI	Gujr
I18N_USCRIPT_GURMUKHI	Guru
I18N_USCRIPT_HAN	Hani
I18N_USCRIPT_HANGUL	Hang
I18N_USCRIPT_HEBREW	Hebr
I18N_USCRIPT_HIRAGANA	Hira
I18N_USCRIPT_KANNADA	Knda
I18N_USCRIPT_KATAKANA	Kana
I18N_USCRIPT_KHMER	Khmr
I18N_USCRIPT_LAO	Laoo
I18N_USCRIPT_LATIN	Latn
I18N_USCRIPT_MALAYALAM	Mlym
I18N_USCRIPT_MONGOLIAN	Mong
I18N_USCRIPT_MYANMAR	Mymr
I18N_USCRIPT_OGHAM	Ogam
I18N_USCRIPT_OLD_ITALIC	Ital
I18N_USCRIPT_ORIYA	Orya
I18N_USCRIPT_RUNIC	Runr
I18N_USCRIPT_SINHALA	Sinh
I18N_USCRIPT_SYRIAC	Syrc
I18N_USCRIPT_TAMIL	Taml
I18N_USCRIPT_TELUGU	Telu
I18N_USCRIPT_THAANA	Thaa
I18N_USCRIPT_THAI	Thai
I18N_USCRIPT_TIBETAN	Tibt
I18N_USCRIPT_CANADIAN_ABORIGINAL	Cans, Canadian_Aboriginal script.
I18N_USCRIPT_UCAS	Canadian_Aboriginal script (alias).
I18N_USCRIPT_YI	Yiii
I18N_USCRIPT_TAGALOG	Tglg
I18N_USCRIPT_HANUNOO	Hano
I18N_USCRIPT_BUHID	Buhd
I18N_USCRIPT_TAGBANWA	Tagb
I18N_USCRIPT_BRAILLE	Brai
I18N_USCRIPT_CYPRIOT	Cprt
I18N_USCRIPT_LIMBU	Limb
I18N_USCRIPT_LINEAR_B	Linb
I18N_USCRIPT_OSMANYA	Osma
I18N_USCRIPT_SHAVIAN	Shaw
I18N_USCRIPT_TAI_LE	Tale
I18N_USCRIPT_UGARITIC	Ugar
I18N_USCRIPT_KATAKANA_OR_HIRAGANA	Hrkt
I18N_USCRIPT_BUGINESE	Bugi
I18N_USCRIPT_GLAGOLITIC	Glag
I18N_USCRIPT_KHAROSHTHI	Khar
I18N_USCRIPT_SYLOTI_NAGRI	Sylo
I18N_USCRIPT_NEW_TAI_LUE	Talu
I18N_USCRIPT_TIFINAGH	Tfng
I18N_USCRIPT_OLD_PERSIAN	Xpeo
I18N_USCRIPT_BALINESE	Bali
I18N_USCRIPT_BATAK	Batk
I18N_USCRIPT_BLISSYMBOLS	Blis
I18N_USCRIPT_BRAHMI	Brah
I18N_USCRIPT_CHAM	Cham
I18N_USCRIPT_CIRTH	Cirt
I18N_USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC	Cyrs
I18N_USCRIPT_DEMOTIC_EGYPTIAN	Egyd
I18N_USCRIPT_HIERATIC_EGYPTIAN	Egyh
I18N_USCRIPT_EGYPTIAN_HIEROGLYPHS	Egyp
I18N_USCRIPT_KHUTSURI	Geok
I18N_USCRIPT_SIMPLIFIED_HAN	Hans
I18N_USCRIPT_TRADITIONAL_HAN	Hant
I18N_USCRIPT_PAHAWH_HMONG	Hmng
I18N_USCRIPT_OLD_HUNGARIAN	Hung
I18N_USCRIPT_HARAPPAN_INDUS	Inds
I18N_USCRIPT_JAVANESE	Java
I18N_USCRIPT_KAYAH_LI	Kali
I18N_USCRIPT_LATIN_FRAKTUR	Latf
I18N_USCRIPT_LATIN_GAELIC	Latg
I18N_USCRIPT_LEPCHA	Lepc
I18N_USCRIPT_LINEAR_A	Lina
I18N_USCRIPT_MANDAIC	Mand
I18N_USCRIPT_MAYAN_HIEROGLYPHS	Maya
I18N_USCRIPT_MEROITIC_HIEROGLYPHS	Mero
I18N_USCRIPT_NKO	Nkoo
I18N_USCRIPT_ORKHON	Orkh
I18N_USCRIPT_OLD_PERMIC	Perm
I18N_USCRIPT_PHAGS_PA	Phag
I18N_USCRIPT_PHOENICIAN	Phnx
I18N_USCRIPT_PHONETIC_POLLARD	Plrd
I18N_USCRIPT_RONGORONGO	Roro
I18N_USCRIPT_SARATI	Sara
I18N_USCRIPT_ESTRANGELO_SYRIAC	Syre
I18N_USCRIPT_WESTERN_SYRIAC	Syrj
I18N_USCRIPT_EASTERN_SYRIAC	Syrn
I18N_USCRIPT_TENGWAR	Teng
I18N_USCRIPT_VAI	Vaii
I18N_USCRIPT_VISIBLE_SPEECH	Visp
I18N_USCRIPT_CUNEIFORM	Xsux
I18N_USCRIPT_UNWRITTEN_LANGUAGES	Zxxx
I18N_USCRIPT_UNKNOWN	Zzzz, Unknown="Code for uncoded script", for unassigned code points
I18N_USCRIPT_CARIAN	Cari
I18N_USCRIPT_JAPANESE	Jpan
I18N_USCRIPT_LANNA	Lana
I18N_USCRIPT_LYCIAN	Lyci
I18N_USCRIPT_LYDIAN	Lydi
I18N_USCRIPT_OL_CHIKI	Olck
I18N_USCRIPT_REJANG	Rjng
I18N_USCRIPT_SAURASHTRA	Saur
I18N_USCRIPT_SIGN_WRITING	Sgnw
I18N_USCRIPT_SUNDANESE	Sund
I18N_USCRIPT_MOON	Moon
I18N_USCRIPT_MEITEI_MAYEK	Mtei
I18N_USCRIPT_IMPERIAL_ARAMAIC	Armi
I18N_USCRIPT_AVESTAN	Avst
I18N_USCRIPT_CHAKMA	Cakm
I18N_USCRIPT_KOREAN	Kore
I18N_USCRIPT_KAITHI	Kthi
I18N_USCRIPT_MANICHAEAN	Mani
I18N_USCRIPT_INSCRIPTIONAL_PAHLAVI	Phli
I18N_USCRIPT_PSALTER_PAHLAVI	Phlp
I18N_USCRIPT_BOOK_PAHLAVI	Phlv
I18N_USCRIPT_INSCRIPTIONAL_PARTHIAN	Prti
I18N_USCRIPT_SAMARITAN	Samr
I18N_USCRIPT_TAI_VIET	Tavt
I18N_USCRIPT_MATHEMATICAL_NOTATION	Zmth
I18N_USCRIPT_SYMBOLS	Zsym
I18N_USCRIPT_BAMUM	Bamu
I18N_USCRIPT_LISU	Lisu
I18N_USCRIPT_NAKHI_GEBA	Nkgb
I18N_USCRIPT_OLD_SOUTH_ARABIAN	Sarb
I18N_USCRIPT_BASSA_VAH	Bass
I18N_USCRIPT_DUPLOYAN_SHORTAND	Dupl
I18N_USCRIPT_ELBASAN	Elba
I18N_USCRIPT_GRANTHA	Gran
I18N_USCRIPT_KPELLE	Kpel
I18N_USCRIPT_LOMA	Loma
I18N_USCRIPT_MENDE	Mend
I18N_USCRIPT_MEROITIC_CURSIVE	Merc
I18N_USCRIPT_OLD_NORTH_ARABIAN	Narb
I18N_USCRIPT_NABATAEAN	Nbat
I18N_USCRIPT_PALMYRENE	Palm
I18N_USCRIPT_SINDHI	Sind
I18N_USCRIPT_WARANG_CITI	Wara
I18N_USCRIPT_AFAKA	Afak
I18N_USCRIPT_JURCHEN	Jurc
I18N_USCRIPT_MRO	Mroo
I18N_USCRIPT_NUSHU	Nshu
I18N_USCRIPT_SHARADA	Shrd
I18N_USCRIPT_SORA_SOMPENG	Sora
I18N_USCRIPT_TAKRI	Takr
I18N_USCRIPT_TANGUT	Tang
I18N_USCRIPT_WOLEAI	Wole
I18N_USCRIPT_ANATOLIAN_HIEROGLYPHS	Hluw
I18N_USCRIPT_KHOJKI	Khoj
I18N_USCRIPT_TIRHUTA	Tirh
I18N_USCRIPT_CAUCASIAN_ALBANIAN	Aghb (Since 6.0)
I18N_USCRIPT_MAHAJANI	Mahj (Since 6.0)
I18N_USCRIPT_AHOM	Ahom (Since 6.0)
I18N_USCRIPT_HATRAN	Hatr (Since 6.0)
I18N_USCRIPT_MODI	Modi (Since 6.0)
I18N_USCRIPT_MULTANI	Mult (Since 6.0)
I18N_USCRIPT_PAU_CIN_HAU	Pauc (Since 6.0)
I18N_USCRIPT_SIDDHAM	Sidd (Since 6.0)
I18N_USCRIPT_ADLAM	Adlm (Since 6.0)
I18N_USCRIPT_BHAIKSUKI	Bhks (Since 6.0)
I18N_USCRIPT_MARCHEN	Marc (Since 6.0)
I18N_USCRIPT_NEWA	Newa (Since 6.0)
I18N_USCRIPT_OSAGE	Osge (Since 6.0)
I18N_USCRIPT_HAN_WITH_BOPOMOFO	Hanb (Since 6.0)
I18N_USCRIPT_JAMO	Jamo (Since 6.0)
I18N_USCRIPT_SYMBOLS_EMOJI	Zsye (Since 6.0)
I18N_USCRIPT_MASARAM_GONDI	Gonm (Since 6.0)
I18N_USCRIPT_SOYOMBO	Soyo (Since 6.0)
I18N_USCRIPT_ZANABAZAR_SQUARE	Zanb (Since 6.0)
I18N_USCRIPT_DOGRA	Dogr (Since 6.0)
I18N_USCRIPT_GUNJALA_GONDI	Gong (Since 6.0)
I18N_USCRIPT_MAKASAR	Maka (Since 6.0)
I18N_USCRIPT_MEDEFAIDRIN	Medf (Since 6.0)
I18N_USCRIPT_HANIFI_ROHINGYA	Rohg (Since 6.0)
I18N_USCRIPT_SOGDIAN	Sogd (Since 6.0)
I18N_USCRIPT_OLD_SOGDIAN	Sogo (Since 6.0)
I18N_USCRIPT_ELYMAIC	Elym (Since 6.0)
I18N_USCRIPT_NYIAKENG_PUACHUE_HMONG	Hmnp (Since 6.0)
I18N_USCRIPT_NANDINAGARI	Nand (Since 6.0)
I18N_USCRIPT_WANCHO	Wcho (Since 6.0)
I18N_USCRIPT_CODE_LIMIT	Count of i18n_uscript_code_e enumerators

enum i18n_uscript_usage_e

Script usage constants. See UAX #31 Unicode Identifier and Pattern Syntax. http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Exclusion_from_Identifiers.

Since :: 6.0

Enumerator:

I18N_USCRIPT_USAGE_NOT_ENCODED	Not encoded in Unicode.
I18N_USCRIPT_USAGE_UNKNOWN	Unknown script usage.
I18N_USCRIPT_USAGE_EXCLUDED	Candidate for Exclusion from Identifiers.
I18N_USCRIPT_USAGE_LIMITED_USE	Limited Use script.
I18N_USCRIPT_USAGE_ASPIRATIONAL	Aspirational Use script.
I18N_USCRIPT_USAGE_RECOMMENDED	Recommended script.

Function Documentation

i18n_ubool i18n_uscript_breaks_between_letters ( i18n_uscript_code_e script )

Gets a value indicating whether the script allows line breaks between letters (excluding hyphenation).

Such a script typically requires dictionary-based line breaking. For example, Hani and Thai.

Since :: 6.0

Parameters:

[in] script Script code

Returns:: TRUE if the script allows line breaks between letters

int i18n_uscript_get_codes	(	const char *	language,
		i18n_uscript_code_e *	codes,
		int32_t	capacity,
		int32_t *	length
	)

Gets the script codes associated with the specified language.

The language is described using either a locale, an ISO 15924 name, or an ISO 15924 abbreviation. Example: If "Malayalam" or "Mlym" is given, the expected result is I18N_USCRIPT_MALAYALAM. Note: To search by short or long script alias only, use i18n_uchar_get_property_value_enum() instead. That does a fast lookup without accessing the locale data.

Since :: 6.0

Remarks:: codes must be allocated before calling the function. If the required capacity is greater than the capacity of codes, then the length is set to the required capacity and I18N_ERROR_BUFFER_OVERFLOW is returned.

Parameters:

[in]	language	The language, for which the script codes are to be retrieved
[in,out]	codes	The array of codes associated with the specified language
[in]	capacity	Capacity of the codes array
[out]	length	The number of items written to the array, or the required capacity if the array's capacity is insufficient

Returns:: 0 on success, otherwise a negative error value

Return values:

I18N_ERROR_NONE	Successful
I18N_ERROR_INVALID_PARAMETER	Invalid function parameter
I18N_ERROR_OUT_OF_MEMORY	Out of memory
I18N_ERROR_BUFFER_OVERFLOW	The supplied array codes is of insufficient capacity

const char* i18n_uscript_get_name ( i18n_uscript_code_e script_code )

Gets the script name for the specified script code.

Returns the long Unicode script name, if there is one. Otherwise returns the 4-letter ISO 15924 script code. Example: If I18N_USCRIPT_MALAYALAM is given, the expected output is "Malayalam".

Since :: 6.0

Parameters:

[in] script_code Uscript code enum

Returns:: long script name as given in PropertyValueAliases.txt, or the 4-letter code, or NULL if script_code is invalid

int i18n_uscript_get_sample_string	(	i18n_uscript_code_e	script,
		i18n_uchar *	sample,
		int32_t	capacity,
		int32_t *	length
	)

Gets the script sample character string.

This string normally consists of one code point but might be longer. The string is empty if the script is not encoded.

Since :: 6.0

Remarks:: sample must be allocated before calling the function. If the required capacity is greater than the capacity of sample, then the length is set to the required capacity and I18N_ERROR_BUFFER_OVERFLOW is returned.

Parameters:

[in]	script	Script code
[in,out]	sample	The sample string for the specified script
[in]	capacity	The number of i18n_uchar characters that sample can hold
[out]	length	The number of i18n_uchar characters written to the string, or the required capacity if the string's capacity is insufficient

Returns:: 0 on success, otherwise a negative error value

Return values:

I18N_ERROR_NONE	Successful
I18N_ERROR_INVALID_PARAMETER	Invalid function parameter
I18N_ERROR_OUT_OF_MEMORY	Out of memory
I18N_ERROR_BUFFER_OVERFLOW	The supplied array sample is of insufficient capacity

int i18n_uscript_get_script	(	i18n_uchar32	codepoint,
		i18n_uscript_code_e *	script_code
	)

Gets the script code associated with the given codepoint.

If the specified codepoint is invalid, the script code returned is equal to 0. Example: If 0x0D02 is given, the expected output is I18N_USCRIPT_MALAYALAM.

Since :: 6.0

Parameters:

[in]	codepoint	i18n_uchar32 codepoint
[out]	script_code	The code of the script, that the specified codepoint belongs to.

Returns:: 0 on success, otherwise a negative error value

Return values:

I18N_ERROR_NONE	Successful
I18N_ERROR_INVALID_PARAMETER	Invalid function parameter
I18N_ERROR_OUT_OF_MEMORY	Out of memory

int i18n_uscript_get_script_extensions	(	i18n_uchar32	codepoint,
		i18n_uscript_code_e *	scripts,
		int32_t	capacity,
		int32_t *	length
	)

Gets the Script Extensions for the specified codepoint.

If codepoint does have Script Extensions, then the Script property value (normally Common or Inherited) is not included.
If codepoint does not have Script Extensions, then the one Script code is written to the output array.
If codepoint is not a valid code point, then the one I18N_USCRIPT_UNKNOWN code is written.

Some characters are commonly used in multiple scripts. For more information, see UAX #24: http://www.unicode.org/reports/tr24/.

Since :: 6.0

Remarks:: scripts must be allocated before calling the function. If the required capacity is greater than the capacity of scripts, then the length is set to the required capacity and I18N_ERROR_BUFFER_OVERFLOW is returned.

Parameters:

[in]	codepoint	Code point
[in,out]	scripts	The array of Script Extensions for the specified codepoint
[in]	capacity	Capacity of the scripts array
[out]	length	The number of items written to the array, or the required capacity if the array's capacity is insufficient

Returns:: 0 on success, otherwise a negative error value

Return values:

I18N_ERROR_NONE	Successful
I18N_ERROR_INVALID_PARAMETER	Invalid function parameter
I18N_ERROR_OUT_OF_MEMORY	Out of memory
I18N_ERROR_BUFFER_OVERFLOW	The supplied array scripts is of insufficient capacity

const char* i18n_uscript_get_short_name ( i18n_uscript_code_e script_code )

Gets the short script name for the specified script code.

Returns the 4-letter ISO 15924 script code, which is the same as the short Unicode script name if Unicode has names for the script. Example: If I18N_USCRIPT_MALAYALAM is given, the expected output is "Mlym".

Since :: 6.0

Parameters:

[in] script_code Uscript code enum

Returns:: short script name (4-letter code), or NULL if script_code is invalid

i18n_uscript_usage_e i18n_uscript_get_usage ( i18n_uscript_code_e script )

Returns the script usage according to UAX #31 Unicode Identifier and Pattern Syntax.

Returns I18N_USCRIPT_USAGE_NOT_ENCODED if the script is not encoded in Unicode.

Since :: 6.0

Parameters:

[in] script Script code

Returns:: script usage

i18n_ubool i18n_uscript_has_script	(	i18n_uchar32	codepoint,
		i18n_uscript_code_e	script_code
	)

Gets a value indicating whether the Script Extensions of the specified codepoint contain the specified script.

If codepoint does not have explicit Script Extensions, then this tests whether codepoint has the Script property value script_code. Some characters are commonly used in multiple scripts. For more information, see UAX #24: http://www.unicode.org/reports/tr24/.

Since :: 6.0

Parameters:

[in]	codepoint	Code point
[in]	script_code	Script code

Returns:: TRUE if script_code is in Script Extensions (codepoint)

i18n_ubool i18n_uscript_is_cased ( i18n_uscript_code_e script )

Gets a value indicating whether case distinctions are customary in modern usage of the script.

For example, Latn and Cyrl.

Since :: 6.0

Parameters:

[in] script Script code

Returns:: TRUE if the script is cased

i18n_ubool i18n_uscript_is_right_to_left ( i18n_uscript_code_e script )

Gets a value indicating whether the script is written right-to-left.

For example, Arab and Hebr.

Since :: 6.0

Parameters:

[in] script Script code

Returns:: TRUE if the script is right-to-left

Required Header

Overview

Functions

Enumeration Type Documentation

Function Documentation