Python bindings for ICU4C using pybind11.
-
Naming Conventions
Renamed functions, methods, and enums to conform to PEP 8.
- Function Names: use lower_case_with_underscores style.
- Method Names: use lower_case_with_underscores style. Also, use one leading underscore only for protected methods.
- C++ Enum Member Names: use UPPER_CASE_WITH_UNDERSCORES style without a leading "k" (e.g., kDateOffset → DATE_OFFSET).
- APIs that match Python reserved words: e.g., with() → with_()
-
Error Handling
-
ICU C/C++ API errors are raised as icupy.icu.ICUError exceptions. The underlying UErrorCode can be retrieved from the error_code attribute of the exception.
For example:
from icupy import icu

try:
    pass  # Call ICU API here...
except icu.ICUError as e:
    print(e.error_code)  # → icu.ErrorCode
    print(e.error_code.get())  # → icu.UErrorCode
-
-
icu::UnicodeString with predefined error callback function
# from Unicode to codepage
from icupy import icu

cnv = icu.ucnv_open("iso8859-1")
context = icu.UserContext(icu.UCNV_ESCAPE_C)  # \uXXXX
action = icu.UConverterFromUCallback(icu.UCNV_FROM_U_CALLBACK_ESCAPE, context)
old_action = icu.ucnv_set_from_u_call_back(cnv, action)
s = icu.UnicodeString("A€B")
s.extract(cnv)  # → b'A\\u20ACB'
# from codepage to Unicode
from icupy import icu

cnv = icu.ucnv_open("Shift-JIS")
context = icu.UserContext(icu.UCNV_ESCAPE_XML_HEX)  # &#xXXXX;
action = icu.UConverterToUCallback(icu.UCNV_TO_U_CALLBACK_ESCAPE, context)
old_action = icu.ucnv_set_to_u_call_back(cnv, action)
src = b"\x61\xeb\x40\x62"  # 0xeb 0x40: UNASSIGNED SEQUENCE
s = icu.UnicodeString(src, -1, cnv)
str(s)  # → 'a&#xEB;&#x40;b'
-
icu::UnicodeString with custom error callback function
# from Unicode to codepage
from icupy import icu
from icupy.utils import gc


def from_unicode_cb(
    options: object,
    args: icu.UConverterFromUnicodeArgs,
    code_units: str,
    length: int,
    code_point: int,
    reason: icu.UConverterCallbackReason,
    error_code: icu.ErrorCode,
) -> None:
    _ = options, length, code_point  # unused
    if reason in [icu.UCNV_UNASSIGNED, icu.UCNV_ILLEGAL, icu.UCNV_IRREGULAR]:
        error_code.set(icu.U_ZERO_ERROR)
        source = "".join(f"\\u{ord(c):04x}" for c in code_units)
        icu.ucnv_cb_from_u_write_bytes(args, source, len(source), 0)


with gc(icu.ucnv_open("iso8859-1"), icu.ucnv_close) as cnv:
    action = icu.UConverterFromUCallback(from_unicode_cb)
    old_action = icu.ucnv_set_from_u_call_back(cnv, action)
    s = icu.UnicodeString("A€B")
    s.extract(cnv)  # → b'A\\u20acB'
# from codepage to Unicode
from icupy import icu
from icupy.utils import gc


def to_unicode_cb(
    options: object,
    args: icu.UConverterToUnicodeArgs,
    code_units: bytes,
    length: int,
    reason: icu.UConverterCallbackReason,
    error_code: icu.ErrorCode,
) -> None:
    _ = options, length  # unused
    if reason in [icu.UCNV_UNASSIGNED, icu.UCNV_ILLEGAL, icu.UCNV_IRREGULAR]:
        error_code.set(icu.U_ZERO_ERROR)
        source = "".join(f"%{b:02X}" for b in code_units)
        icu.ucnv_cb_to_u_write_uchars(args, source, len(source), 0)


with gc(icu.ucnv_open("Shift-JIS"), icu.ucnv_close) as cnv:
    action = icu.UConverterToUCallback(to_unicode_cb)
    old_action = icu.ucnv_set_to_u_call_back(cnv, action)
    src = b"\x61\xeb\x40\x62"  # 0xeb 0x40: UNASSIGNED SEQUENCE
    s = icu.UnicodeString(src, -1, cnv)
    str(s)  # → 'a%EB%40b'
-
icu::BreakIterator for word-breaks
from icupy import icu

bi = icu.BreakIterator.create_word_instance("en_US")
src = icu.UnicodeString("Alice was beginning to get very tired of sitting by her sister on the bank.")
bi.set_text(src)
result = []
start = bi.first()
while (end := bi.next()) != icu.UBRK_DONE:
    if bi.get_rule_status() != icu.UBRK_WORD_NONE:
        result.append(src[start:end])
    start = end
# result: ['Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank']
-
Natural sort (human-friendly sorting)
from icupy import icu

coll = icu.Collator.create_instance("en_US")
coll.set_attribute(icu.UCOL_NUMERIC_COLLATION, icu.UCOL_ON)
data = ["file1.txt", "file10.txt", "file2.txt", "file20.txt", "file3.txt"]
sorted(data, key=coll.get_sort_key)
# ['file1.txt', 'file2.txt', 'file3.txt', 'file10.txt', 'file20.txt']
-
icu::IDNA (UTS #46)
from icupy import icu

uts46 = icu.IDNA.create_uts46_instance(icu.UIDNA_DEFAULT | icu.UIDNA_CHECK_BIDI | icu.UIDNA_CHECK_CONTEXTJ)
dest = icu.UnicodeString()
info = icu.IDNAInfo()
# a + ZERO WIDTH NON-JOINER + b.com
uts46.name_to_ascii("a\u200cb.com", dest, info)  # → 'xn--ab-j1t.com'
bool(info.get_errors() & icu.UIDNA_ERROR_BIDI)  # → False
bool(info.get_errors() & icu.UIDNA_ERROR_CONTEXTJ)  # → True
-
icu::number::NumberFormatter (ICU 60+)
from icupy import icu
from icupy.icu import number

template = (
    number.NumberFormatter.with_()
    .notation(number.Notation.compact_short())
    .unit(icu.CurrencyUnit("EUR"))
    .precision(number.Precision.max_significant_digits(2))
)
template.locale("en_US").format_int(1234).to_string()  # "€1.2K" in en-US
-
icu::RegexMatcher::find with custom callback function
from icupy import icu

src = icu.UnicodeString("aaaaaaaaaaaaaaaaaaab")
matcher = icu.RegexMatcher("((.)\\2)x", src, 0)


def progress_callback(options: dict[str, int], match_index: int) -> bool:
    if not isinstance(options, dict):
        return False
    calls = options.get("numCalls", 0) + 1
    options["numCalls"] = calls
    options["lastIndex"] = match_index
    max_calls = options.get("maxCalls", -1)
    return True if max_calls < 0 else calls < max_calls


info = {}
context = icu.UserContext(info)
callback = icu.URegexFindProgressCallback(progress_callback, context)
matcher.set_find_progress_callback(callback)
matcher.find(0)  # → False
# info: {'numCalls': 18, 'lastIndex': 18}

info.clear()
info["maxCalls"] = 5
matcher.find(0)  # → ICUError: U_REGEX_STOPPED_BY_CALLER
# info: {'maxCalls': 5, 'numCalls': 5, 'lastIndex': 5}
-
icu::number::SimpleNumberFormatter (ICU 73+)
from icupy import icu
from icupy.icu import number

fmt = number.SimpleNumberFormatter.for_locale_and_grouping_strategy("de-CH", icu.UNUM_GROUPING_ON_ALIGNED)
fmtval = fmt.format_int64(1234567)
fmtval.to_string()  # → "1'234'567"
-
Subclassing icu::Transliterator
# Uppercase letters while skipping text enclosed in backticks
from icupy import icu


class TestTransliterator(icu.Transliterator):
    def __init__(self, filter_set: icu.UnicodeSet | None = None) -> None:
        icu.Transliterator.__init__(self, "Any-UpperWithoutCode", filter_set)

    def _handle_transliterate(
        self,
        text: icu.Replaceable,
        pos: icu.UTransPosition,
        incremental: bool,
    ) -> None:
        # Implement the transliteration algorithm here.
        cursor = pos.start
        in_backtick = False
        while cursor < pos.limit:
            c = text.char32_at(cursor)
            char_len = icu.u16_length(c)
            if c == 0x60:
                in_backtick = not in_backtick
                cursor += char_len
                continue
            if not in_backtick and icu.u_isalpha(c) and icu.u_islower(c):
                upper = icu.u_toupper(c)
                if upper != c:
                    text.handle_replace_between(cursor, cursor + char_len, chr(upper))
                    char_len = icu.u16_length(upper)
            cursor += char_len
        pos.start = pos.limit


tl = TestTransliterator()
text = icu.UnicodeString("Subclasses must implement `_handle_transliterate()`, which defines their own transliteration algorithm.")
tl.transliterate(text)
# text: "SUBCLASSES MUST IMPLEMENT `_handle_transliterate()`, WHICH DEFINES THEIR OWN TRANSLITERATION ALGORITHM."
- Python >=3.10
- ICU4C (ICU - The International Components for Unicode) (>=70 recommended)
- C++17 compatible compiler (see Supported Compilers)
- CMake >=3.15
-
Windows:
Install the following dependencies:
- Python >=3.10
- Pre-built ICU4C binary package (>=70 recommended)
- C++17 compatible compiler. Visual Studio 2022 or newer recommended
- CMake >=3.15
- Note: Add CMake to the system PATH.
-
Linux:
To install dependencies, run the following command:
-
Ubuntu/Debian:
sudo apt install g++ cmake libicu-dev python3-dev python3-pip
-
Fedora:
sudo dnf install gcc-c++ cmake icu libicu-devel python3-devel
Note: If your system's ICU is out of date, consider building ICU4C from source or installing pre-built ICU4C binary package.
-
-
Configuring environment variables
-
Windows:
-
Set the ICU_ROOT environment variable to the root of the ICU installation.
For example, if the ICU is located in C:\icu4c:
in PowerShell:
$env:ICU_ROOT = "C:\icu4c"
or in Command Prompt:
set ICU_ROOT=C:\icu4c
-
To verify settings using icuinfo (64-bit):
in PowerShell:
& $env:ICU_ROOT\bin64\icuinfo
or in Command Prompt:
%ICU_ROOT%\bin64\icuinfo
-
-
Linux:
-
If the ICU is located in a non-standard location, set the PKG_CONFIG_PATH and LD_LIBRARY_PATH environment variables.
For example, if the ICU is located in /usr/local:
export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
-
To verify settings using pkg-config:
pkg-config --cflags --libs icu-uc
# -I/usr/local/include -L/usr/local/lib -licuuc
-
-
-
Installing from PyPI
pip install icupy
Optionally, CMake environment variables are available. For example, using the Ninja build system and Clang:
CMAKE_GENERATOR=Ninja CXX=clang++ pip install icupy
Alternatively, installing development version from the git repository:
pip install git+https://github.com/miute/icupy.git
-
Configuring environment variables
-
Windows:
-
Set the ICU_ROOT environment variable to the root of the ICU installation (default is C:\icu).
For example, if the ICU is located in C:\icu4c:
in PowerShell:
$env:ICU_ROOT = "C:\icu4c"
or in Command Prompt:
set ICU_ROOT=C:\icu4c
-
-
Linux:
-
If the ICU is located in a non-standard location, set the LD_LIBRARY_PATH environment variable.
For example, if the ICU is located in /usr/local:
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
-
-
-
Using icupy
import icupy.icu as icu
# or
from icupy import icu
This project is licensed under the MIT License.