From 270602a6d164c9d7e9339f21a003dfd70d50bfbe Mon Sep 17 00:00:00 2001 From: Shreyas Pawar Date: Tue, 26 May 2026 09:59:57 +0000 Subject: [PATCH 1/4] Fix partial TSV matching, added support for chemical formulas, and relative file paths Signed-off-by: Shreyas Pawar --- Jenkinsfile | 2 +- .../hi/data/electronic/chemical_names.tsv | 37 ++++ .../hi/taggers/electronic.py | 74 +++++--- .../hi/verbalizers/electronic.py | 169 +++++++++--------- .../test_cases_electronic.txt | 7 +- 5 files changed, 183 insertions(+), 106 deletions(-) create mode 100644 nemo_text_processing/text_normalization/hi/data/electronic/chemical_names.tsv diff --git a/Jenkinsfile b/Jenkinsfile index 24ac047eb..63986e94e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -28,7 +28,7 @@ pipeline { MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-26-26-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/electronic/chemical_names.tsv b/nemo_text_processing/text_normalization/hi/data/electronic/chemical_names.tsv new file mode 100644 index 000000000..3dbecc3bf --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/electronic/chemical_names.tsv @@ -0,0 +1,37 @@ +H2O जल +H₂O जल +CO2 कार्बन डाइऑक्साइड +CO₂ कार्बन डाइऑक्साइड +O2 ऑक्सीजन +O₂ ऑक्सीजन +N2 नाइट्रोजन +N₂ नाइट्रोजन +NaCl सोडियम क्लोराइड +HCl हाइड्रोक्लोरिक एसिड +H2SO4 सल्फ्यूरिक एसिड +H₂SO₄ सल्फ्यूरिक एसिड +HNO3 नाइट्रिक एसिड +HNO₃ नाइट्रिक एसिड +NH3 अमोनिया +NH₃ अमोनिया +CH4 मीथेन +CH₄ मीथेन +NaOH सोडियम हाइड्रॉक्साइड +KOH पोटेशियम हाइड्रॉक्साइड +Ca(OH)2 कैल्शियम हाइड्रॉक्साइड +Ca(OH)₂ कैल्शियम हाइड्रॉक्साइड +CaCO3 कैल्शियम कार्बोनेट +CaCO₃ कैल्शियम कार्बोनेट +C6H12O6 ग्लूकोज़ +C₆H₁₂O₆ ग्लूकोज़ +NaHCO3 सोडियम बाइकार्बोनेट +NaHCO₃ सोडियम बाइकार्बोनेट +Na2CO3 सोडियम कार्बोनेट +Na₂CO₃ सोडियम कार्बोनेट +CH₃COO– एसीटेट आयन +CH3COO- एसीटेट आयन +CH₃COO⁻ एसीटेट आयन +Ba(OH)2 बेरियम हाइड्रॉक्साइड +Ba(OH)₂ बेरियम हाइड्रॉक्साइड +Al2(SO4)3 एल्युमिनियम सल्फेट +Al₂(SO₄)₃ एल्युमिनियम सल्फेट \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/electronic.py b/nemo_text_processing/text_normalization/hi/taggers/electronic.py index 7807117e6..042463d90 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/hi/taggers/electronic.py @@ -26,7 +26,7 @@ class ElectronicFst(GraphFst): e.g. kumar@gmail.com -> tokens { electronic { username: "kumar" domain: "gmail.com" } } e.g. https://google.com/ -> tokens { electronic { protocol: "https" domain: "google.com/" } } e.g. C:\\Users\\HP\\Desktop -> tokens { electronic { path: "C:\\Users\\HP\\Desktop" } } - e.g. 192.168.1.1 -> tokens { electronic { ip: "192.168.1.1" } } + e.g. 192.168.1.1 -> tokens { electronic { domain: "192.168.1.1" } } """ @@ -39,7 +39,6 @@ def __init__(self, deterministic: bool = True): alphanumeric = NEMO_ALPHA | NEMO_DIGIT | NEMO_HI_DIGIT | subscript_digit - # email username_chars = NEMO_ALPHA | NEMO_DIGIT | pynini.accep(".") | pynini.accep("-") | pynini.accep("_") username = pynutil.insert("username: \"") + pynini.closure(username_chars, 1) + pynutil.insert("\"") @@ -48,7 +47,6 @@ def __init__(self, deterministic: bool = True): email_graph = username + pynini.cross("@", "") + domain - # url: protocol handling for https://, http://, www., and combined forms protocol_start = pynini.cross("https://", "https") | pynini.cross("http://", "http") protocol_end = pynini.cross("www.", "www") protocol = ( @@ -80,7 +78,6 @@ def __init__(self, deterministic: bool = True): url_graph = protocol + url_domain - # file paths: Windows (C:\...), Unix (/...), and backslash-prefixed (\...) drive_letter = NEMO_ALPHA windows_path_chars = alphanumeric | pynini.union( pynini.accep("\\"), @@ -107,8 +104,23 @@ def __init__(self, deterministic: bool = True): pynini.accep("_"), pynini.accep("$"), ) + + unix_segment_chars = alphanumeric | pynini.union( + pynini.accep("."), + pynini.accep("-"), + pynini.accep("_"), + pynini.accep("$"), + ) + unix_segment = pynini.closure(unix_segment_chars, 1) + + abs_unix_path = pynini.accep("/") + pynini.closure(unix_path_chars, 1) + + rel_unix_path = unix_segment + pynini.accep("/") + pynini.closure(unix_path_chars, 0) + unix_path = ( - pynutil.insert("path: \"") + pynini.accep("/") + pynini.closure(unix_path_chars, 1) + pynutil.insert("\"") + pynutil.insert("path: \"") + + (abs_unix_path | rel_unix_path) + + pynutil.insert("\"") ) backslash_path_chars = alphanumeric | pynini.union( @@ -125,12 +137,10 @@ def __init__(self, deterministic: bool = True): + pynutil.insert("\"") ) - # ip addresses: exactly 4 dot-separated octets ip_octet = pynini.closure(NEMO_DIGIT, 1, 3) dot_octet = pynini.accep(".") + ip_octet ip_address = pynutil.insert("domain: \"") + ip_octet + pynini.closure(dot_octet, 3, 3) + pynutil.insert("\"") - # domains: simple TLD-based (abc.com) and government/education suffixes (.gov.in, .ac.in) domain_segment_chars = NEMO_ALPHA | NEMO_DIGIT | pynini.accep("-") domain_segment = pynini.closure(domain_segment_chars, 1) @@ -144,7 +154,6 @@ def __init__(self, deterministic: bool = True): pynutil.insert("domain: \"") + domain_body + pynini.closure(pynini.accep("/"), 0, 1) + pynutil.insert("\"") ) - # file extensions: e.g. report.pdf, data.csv known_extensions = pynini.project( pynini.string_file(get_abs_path("data/electronic/file_extensions.tsv")), "input" ) @@ -154,27 +163,50 @@ def __init__(self, deterministic: bool = True): pynutil.insert("domain: \"") + filename_stem + pynini.accep(".") + known_extensions + pynutil.insert("\"") ) - # chemical formulas with subscript digits: e.g. H₂O, CO₂ - chemical_chars = NEMO_ALPHA | subscript_digit + chemical_chars = ( + NEMO_ALPHA + | NEMO_DIGIT + | subscript_digit + | pynini.accep("(") + | pynini.accep(")") + | pynini.accep("+") + | pynini.accep("-") + | pynini.accep("–") + ) + + raw_chemical = NEMO_ALPHA + pynini.closure(chemical_chars, 1) + + any_chem = pynini.closure(chemical_chars) + has_open = any_chem + pynini.accep("(") + any_chem + no_open = pynini.difference(any_chem, has_open) + ends_with_close = any_chem + pynini.accep(")") + + unbalanced_trailing = pynini.intersect(no_open, ends_with_close) + + valid_chemical = pynini.difference(raw_chemical, unbalanced_trailing).optimize() + chemical_formula = ( - pynutil.insert("domain: \"") + NEMO_ALPHA + pynini.closure(chemical_chars, 1) + pynutil.insert("\"") + pynutil.insert("domain: \"") + + valid_chemical + + pynutil.insert("\"") ) - - # alphanumeric codes: strings containing both letters and digits, - # optionally separated by hyphens, e.g. IELF004, N95, GSAT-18, F-35B + alnum_seg = pynini.closure(NEMO_ALPHA | NEMO_DIGIT, 1) - alphanumeric_pattern = alnum_seg + pynini.closure(pynini.accep("-") + alnum_seg) - - alnum_hyp_sigma = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.accep("-")) - contains_alpha = alnum_hyp_sigma + NEMO_ALPHA + alnum_hyp_sigma - contains_digit = alnum_hyp_sigma + NEMO_DIGIT + alnum_hyp_sigma + + separator = pynini.accep("-") | pynini.accep(".") + alphanumeric_pattern = alnum_seg + pynini.closure(separator + alnum_seg) + + alnum_hyp_dot_sigma = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.accep("-") | pynini.accep(".")) + + contains_alpha = alnum_hyp_dot_sigma + NEMO_ALPHA + alnum_hyp_dot_sigma + contains_digit = alnum_hyp_dot_sigma + NEMO_DIGIT + alnum_hyp_dot_sigma + alphanumeric_code_fst = pynini.intersect( pynini.intersect(alphanumeric_pattern, contains_alpha), contains_digit ).optimize() alphanumeric_code = pynutil.insert("domain: \"") + alphanumeric_code_fst + pynutil.insert("\"") - # Weights use 3 tiers: structurally unambiguous (1.0), moderately general (1.1), greedy (1.2) graph = ( pynutil.add_weight(url_graph, 1.0) | pynutil.add_weight(email_graph, 1.0) @@ -189,4 +221,4 @@ def __init__(self, deterministic: bool = True): ) self.graph = graph.optimize() - self.fst = self.add_tokens(graph).optimize() + self.fst = self.add_tokens(graph).optimize() \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py b/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py index 124e1c60b..8bfa80e2b 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py @@ -30,10 +30,10 @@ class ElectronicFst(GraphFst): Uses a phonetic-first approach with letter-by-letter fallback. Examples: - electronic { username: "kumar" domain: "gmail.com" } -> "कुमार एट जीमेल डॉट कॉम" + electronic { username: "kumar" domain: "gmail.com" } -> "के यू एम ए आर एट जीमेल डॉट कॉम" electronic { protocol: "https" domain: "google.com/" } -> "एच टी टी पी एस कोलन फॉरवर्ड स्लैश फॉरवर्ड स्लैश गूगल डॉट कॉम फॉरवर्ड स्लैश" - electronic { path: "C:\\Users\\HP" } -> "सी कोलन बैकवर्ड स्लैश यूज़र्स बैकवर्ड स्लैश एच पी" - electronic { ip: "192.168.1.1" } -> "एक नौ दो डॉट एक छह आठ डॉट एक डॉट एक" + electronic { path: "C:\\Users\\HP\\Desktop" } -> "सी कोलन बैकवर्ड स्लैश यूज़र्स बैकवर्ड स्लैश एच पी बैकवर्ड स्लैश डेस्कटॉप" + electronic { domain: "192.168.1.1" } -> "एक नौ दो डॉट एक छह आठ डॉट एक डॉट एक" Args: deterministic: if True will provide a single transduction option, @@ -43,109 +43,112 @@ class ElectronicFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="electronic", kind="verbalize", deterministic=deterministic) - # Load data files - symbols_graph = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).optimize() - domain_graph = pynini.string_file(get_abs_path("data/electronic/domain.tsv")).optimize() - server_name_graph = pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).optimize() - common_words_graph = pynini.string_file(get_abs_path("data/electronic/common_words.tsv")).optimize() - latin_to_hindi_graph = pynini.string_file(get_abs_path("data/address/letters.tsv")) - latin_to_hindi_graph = capitalized_input_graph(latin_to_hindi_graph).optimize() - - # Digit mappings - use telephone number mappings for ASCII digits - ascii_digit_graph = pynini.string_file(get_abs_path("data/telephone/number.tsv")).optimize() - hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize() - hindi_zero_graph = pynini.string_file(get_abs_path("data/numbers/zero.tsv")).optimize() + symbols_graph = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).optimize() + domain_graph = pynini.string_file(get_abs_path("data/electronic/domain.tsv")).optimize() + server_name_graph = pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).optimize() + chemical_graph = pynini.string_file(get_abs_path("data/electronic/chemical_names.tsv")).optimize() + common_words_graph = pynini.string_file(get_abs_path("data/electronic/common_words.tsv")).optimize() + latin_to_hindi_graph = pynini.string_file(get_abs_path("data/address/letters.tsv")) + latin_to_hindi_graph = capitalized_input_graph(latin_to_hindi_graph).optimize() + + ascii_digit_graph = pynini.string_file(get_abs_path("data/telephone/number.tsv")).optimize() + hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize() + hindi_zero_graph = pynini.string_file(get_abs_path("data/numbers/zero.tsv")).optimize() subscript_digit_graph = pynini.string_file(get_abs_path("data/electronic/subscript_digit.tsv")).optimize() - digit_verbalization = ascii_digit_graph | hindi_digit_graph | hindi_zero_graph | subscript_digit_graph - - # Combined phonetic word graph: server names + common words - phonetic_word = server_name_graph | common_words_graph + digit_verbalization = ( + ascii_digit_graph | hindi_digit_graph | hindi_zero_graph | subscript_digit_graph + ) - # ============ CHARACTER VERBALIZATION ============ - # Single character to Hindi verbalization with space insertion - char_to_hindi = pynutil.add_weight(latin_to_hindi_graph, 1.0) | pynutil.add_weight( # Letter mapping - digit_verbalization, 1.0 - ) # Digit mapping - char_with_space = char_to_hindi + insert_space + protocol_graph = pynini.string_file(get_abs_path("data/electronic/protocols.tsv")).optimize() - # ============ SYMBOL VERBALIZATION ============ - symbol_to_hindi = symbols_graph + insert_space + single_letter = latin_to_hindi_graph + insert_space + single_digit = digit_verbalization + insert_space + single_symbol = symbols_graph + insert_space - # ============ DOMAIN VERBALIZATION ============ - # Domain extension verbalization (.com -> डॉट कॉम) - domain_ext_verbalization = pynini.cross(".", "डॉट ") + domain_graph + insert_space + single_non_alpha = ( + pynutil.add_weight(single_symbol, 1.0) + | pynutil.add_weight(single_digit, 1.0) + ) - # ============ PROTOCOL VERBALIZATION ============ - protocol_graph = pynini.string_file(get_abs_path("data/electronic/protocols.tsv")).optimize() - protocol_verbalization = protocol_graph + insert_space + def make_alpha_run_verbalizer(tsv_graphs): + phonetic = pynini.union( + *[pynutil.add_weight(g + insert_space, w) for g, w in tsv_graphs] + ) + literal = pynutil.add_weight(pynini.closure(single_letter, 1), 1.1) + return phonetic | literal + + def make_content(alpha_run_verb, non_alpha_sep=None): + if non_alpha_sep is None: + non_alpha_sep = single_non_alpha + mandatory_sep = pynini.closure(non_alpha_sep, 1) + return ( + pynini.closure(non_alpha_sep, 0) + + pynini.closure(alpha_run_verb + mandatory_sep, 0) + + pynini.closure(alpha_run_verb, 0, 1) + + pynini.closure(non_alpha_sep, 0) + ) - # ============ FIELD EXTRACTION ============ - # Extract username field delete_username_tag = pynutil.delete("username: \"") - delete_domain_tag = pynutil.delete("domain: \"") + delete_domain_tag = pynutil.delete("domain: \"") delete_protocol_tag = pynutil.delete("protocol: \"") - delete_path_tag = pynutil.delete("path: \"") - delete_quote = pynutil.delete("\"") - - # Username verbalization: letter-by-letter with symbol handling - username_content = pynini.closure( - pynutil.add_weight(phonetic_word + insert_space, 0.9) - | pynutil.add_weight(symbol_to_hindi, 1.0) - | pynutil.add_weight(char_with_space, 1.1), - 1, - ) - + delete_path_tag = pynutil.delete("path: \"") + delete_quote = pynutil.delete("\"") + + username_alpha_run = make_alpha_run_verbalizer([ + (server_name_graph, 0.85), + (domain_graph, 0.87), + (common_words_graph, 0.90), + ]) + username_content = make_content(username_alpha_run) username_graph = ( - delete_username_tag + username_content + delete_quote + delete_space + pynutil.insert("एट ") # @ symbol + delete_username_tag + + username_content + + delete_quote + + delete_space + + pynutil.insert("एट ") ) - # Domain verbalization - domain_content = pynini.closure( - pynutil.add_weight(phonetic_word + insert_space, 0.9) - | pynutil.add_weight(domain_ext_verbalization, 0.95) - | pynutil.add_weight(symbol_to_hindi, 1.0) - | pynutil.add_weight(char_with_space, 1.1), - 1, + domain_alpha_run = make_alpha_run_verbalizer([ + (server_name_graph, 0.85), + (domain_graph, 0.87), + (common_words_graph, 0.90), + ]) + + domain_content = ( + pynutil.add_weight(chemical_graph + insert_space, 0.8) + | pynutil.add_weight(make_content(domain_alpha_run), 1.0) ) - + domain_only_graph = delete_domain_tag + domain_content + delete_quote - # Protocol verbalization - protocol_only_graph = delete_protocol_tag + protocol_verbalization + delete_quote + delete_space - - # Path verbalization (Windows/Unix file paths) - path_content = pynini.closure( - pynutil.add_weight(common_words_graph + insert_space, 0.9) - | pynutil.add_weight(symbol_to_hindi, 1.0) - | pynutil.add_weight(char_with_space, 1.1), - 1, + protocol_only_graph = ( + delete_protocol_tag + + protocol_graph + insert_space + + delete_quote + + delete_space ) - path_graph = delete_path_tag + path_content + delete_quote + path_alpha_run = make_alpha_run_verbalizer([ + (domain_graph, 0.87), + (common_words_graph, 0.90), + ]) + path_content = make_content(path_alpha_run) + path_graph = delete_path_tag + path_content + delete_quote - # IP address verbalization (digit by digit) - ip_char = pynutil.add_weight(symbols_graph + insert_space, 1.0) | pynutil.add_weight( - digit_verbalization + insert_space, 1.0 - ) + ip_char = single_symbol | single_digit ip_content = pynini.closure(ip_char, 1) + ip_graph = delete_domain_tag + ip_content + delete_quote - ip_graph = delete_domain_tag + ip_content + delete_quote - - # ============ COMBINED GRAPH ============ - # Email: username + domain email_full = username_graph + domain_only_graph + url_full = protocol_only_graph + domain_only_graph - # URL with protocol: protocol + domain - url_full = protocol_only_graph + domain_only_graph - - # Combined final graph graph = ( - pynutil.add_weight(url_full, 1.0) - | pynutil.add_weight(email_full, 1.01) - | pynutil.add_weight(path_graph, 1.02) - | pynutil.add_weight(ip_graph, 1.03) + pynutil.add_weight(url_full, 1.0) + | pynutil.add_weight(email_full, 1.01) + | pynutil.add_weight(path_graph, 1.02) + | pynutil.add_weight(ip_graph, 1.03) | pynutil.add_weight(domain_only_graph, 1.04) ) delete_tokens = self.delete_tokens(graph) - self.fst = delete_tokens.optimize() + self.fst = delete_tokens.optimize() \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_electronic.txt index 85b34c4a3..580a3de15 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_electronic.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_electronic.txt @@ -57,4 +57,9 @@ ip address 192.168.1.1~आई पी एड्रेस एक नौ दो ड ip address 10.0.0.1~आई पी एड्रेस एक शून्य डॉट शून्य डॉट शून्य डॉट एक report.pdf~आर ई पी ओ आर टी डॉट पी डी एफ photo.jpg~पी एच ओ टी ओ डॉट जे पी जी -data.csv~डेटा डॉट सी एस वी \ No newline at end of file +data.csv~डेटा डॉट सी एस वी +robinson.org~आर ओ बी आई एन एस ओ एन डॉट ऑर्ग +anand@gmail.com~ए एन ए एन डी एट जीमेल डॉट कॉम +Al₂(SO₄)₃~एल्युमिनियम सल्फेट +C₂H₄~सी दो एच चार +home/desktop~होम फॉरवर्ड स्लैश डेस्कटॉप \ No newline at end of file From 384ebd797e52d6cc60a8c068aa79e8e8f7c0c1aa Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 26 May 2026 10:39:05 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../hi/taggers/electronic.py | 48 +++---- .../hi/verbalizers/electronic.py | 121 ++++++++---------- 2 files changed, 74 insertions(+), 95 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/electronic.py b/nemo_text_processing/text_normalization/hi/taggers/electronic.py index 042463d90..980a7e604 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/hi/taggers/electronic.py @@ -104,7 +104,7 @@ def __init__(self, deterministic: bool = True): pynini.accep("_"), pynini.accep("$"), ) - + unix_segment_chars = alphanumeric | pynini.union( pynini.accep("."), pynini.accep("-"), @@ -114,14 +114,10 @@ def __init__(self, deterministic: bool = True): unix_segment = pynini.closure(unix_segment_chars, 1) abs_unix_path = pynini.accep("/") + pynini.closure(unix_path_chars, 1) - + rel_unix_path = unix_segment + pynini.accep("/") + pynini.closure(unix_path_chars, 0) - - unix_path = ( - pynutil.insert("path: \"") - + (abs_unix_path | rel_unix_path) - + pynutil.insert("\"") - ) + + unix_path = pynutil.insert("path: \"") + (abs_unix_path | rel_unix_path) + pynutil.insert("\"") backslash_path_chars = alphanumeric | pynini.union( pynini.accep("\\"), @@ -164,43 +160,39 @@ def __init__(self, deterministic: bool = True): ) chemical_chars = ( - NEMO_ALPHA - | NEMO_DIGIT - | subscript_digit - | pynini.accep("(") + NEMO_ALPHA + | NEMO_DIGIT + | subscript_digit + | pynini.accep("(") | pynini.accep(")") | pynini.accep("+") | pynini.accep("-") - | pynini.accep("–") + | pynini.accep("–") ) - + raw_chemical = NEMO_ALPHA + pynini.closure(chemical_chars, 1) - + any_chem = pynini.closure(chemical_chars) has_open = any_chem + pynini.accep("(") + any_chem no_open = pynini.difference(any_chem, has_open) ends_with_close = any_chem + pynini.accep(")") - + unbalanced_trailing = pynini.intersect(no_open, ends_with_close) - + valid_chemical = pynini.difference(raw_chemical, unbalanced_trailing).optimize() - - chemical_formula = ( - pynutil.insert("domain: \"") - + valid_chemical - + pynutil.insert("\"") - ) - + + chemical_formula = pynutil.insert("domain: \"") + valid_chemical + pynutil.insert("\"") + alnum_seg = pynini.closure(NEMO_ALPHA | NEMO_DIGIT, 1) - + separator = pynini.accep("-") | pynini.accep(".") alphanumeric_pattern = alnum_seg + pynini.closure(separator + alnum_seg) alnum_hyp_dot_sigma = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.accep("-") | pynini.accep(".")) - + contains_alpha = alnum_hyp_dot_sigma + NEMO_ALPHA + alnum_hyp_dot_sigma contains_digit = alnum_hyp_dot_sigma + NEMO_DIGIT + alnum_hyp_dot_sigma - + alphanumeric_code_fst = pynini.intersect( pynini.intersect(alphanumeric_pattern, contains_alpha), contains_digit ).optimize() @@ -221,4 +213,4 @@ def __init__(self, deterministic: bool = True): ) self.graph = graph.optimize() - self.fst = self.add_tokens(graph).optimize() \ No newline at end of file + self.fst = self.add_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py b/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py index 8bfa80e2b..e1b387a2e 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py @@ -43,37 +43,30 @@ class ElectronicFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="electronic", kind="verbalize", deterministic=deterministic) - symbols_graph = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).optimize() - domain_graph = pynini.string_file(get_abs_path("data/electronic/domain.tsv")).optimize() - server_name_graph = pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).optimize() - chemical_graph = pynini.string_file(get_abs_path("data/electronic/chemical_names.tsv")).optimize() - common_words_graph = pynini.string_file(get_abs_path("data/electronic/common_words.tsv")).optimize() - latin_to_hindi_graph = pynini.string_file(get_abs_path("data/address/letters.tsv")) - latin_to_hindi_graph = capitalized_input_graph(latin_to_hindi_graph).optimize() - - ascii_digit_graph = pynini.string_file(get_abs_path("data/telephone/number.tsv")).optimize() - hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize() - hindi_zero_graph = pynini.string_file(get_abs_path("data/numbers/zero.tsv")).optimize() + symbols_graph = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).optimize() + domain_graph = pynini.string_file(get_abs_path("data/electronic/domain.tsv")).optimize() + server_name_graph = pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).optimize() + chemical_graph = pynini.string_file(get_abs_path("data/electronic/chemical_names.tsv")).optimize() + common_words_graph = pynini.string_file(get_abs_path("data/electronic/common_words.tsv")).optimize() + latin_to_hindi_graph = pynini.string_file(get_abs_path("data/address/letters.tsv")) + latin_to_hindi_graph = capitalized_input_graph(latin_to_hindi_graph).optimize() + + ascii_digit_graph = pynini.string_file(get_abs_path("data/telephone/number.tsv")).optimize() + hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize() + hindi_zero_graph = pynini.string_file(get_abs_path("data/numbers/zero.tsv")).optimize() subscript_digit_graph = pynini.string_file(get_abs_path("data/electronic/subscript_digit.tsv")).optimize() - digit_verbalization = ( - ascii_digit_graph | hindi_digit_graph | hindi_zero_graph | subscript_digit_graph - ) + digit_verbalization = ascii_digit_graph | hindi_digit_graph | hindi_zero_graph | subscript_digit_graph protocol_graph = pynini.string_file(get_abs_path("data/electronic/protocols.tsv")).optimize() single_letter = latin_to_hindi_graph + insert_space - single_digit = digit_verbalization + insert_space - single_symbol = symbols_graph + insert_space + single_digit = digit_verbalization + insert_space + single_symbol = symbols_graph + insert_space - single_non_alpha = ( - pynutil.add_weight(single_symbol, 1.0) - | pynutil.add_weight(single_digit, 1.0) - ) + single_non_alpha = pynutil.add_weight(single_symbol, 1.0) | pynutil.add_weight(single_digit, 1.0) def make_alpha_run_verbalizer(tsv_graphs): - phonetic = pynini.union( - *[pynutil.add_weight(g + insert_space, w) for g, w in tsv_graphs] - ) + phonetic = pynini.union(*[pynutil.add_weight(g + insert_space, w) for g, w in tsv_graphs]) literal = pynutil.add_weight(pynini.closure(single_letter, 1), 1.1) return phonetic | literal @@ -89,66 +82,60 @@ def make_content(alpha_run_verb, non_alpha_sep=None): ) delete_username_tag = pynutil.delete("username: \"") - delete_domain_tag = pynutil.delete("domain: \"") + delete_domain_tag = pynutil.delete("domain: \"") delete_protocol_tag = pynutil.delete("protocol: \"") - delete_path_tag = pynutil.delete("path: \"") - delete_quote = pynutil.delete("\"") - - username_alpha_run = make_alpha_run_verbalizer([ - (server_name_graph, 0.85), - (domain_graph, 0.87), - (common_words_graph, 0.90), - ]) + delete_path_tag = pynutil.delete("path: \"") + delete_quote = pynutil.delete("\"") + + username_alpha_run = make_alpha_run_verbalizer( + [ + (server_name_graph, 0.85), + (domain_graph, 0.87), + (common_words_graph, 0.90), + ] + ) username_content = make_content(username_alpha_run) - username_graph = ( - delete_username_tag - + username_content - + delete_quote - + delete_space - + pynutil.insert("एट ") + username_graph = delete_username_tag + username_content + delete_quote + delete_space + pynutil.insert("एट ") + + domain_alpha_run = make_alpha_run_verbalizer( + [ + (server_name_graph, 0.85), + (domain_graph, 0.87), + (common_words_graph, 0.90), + ] ) - domain_alpha_run = make_alpha_run_verbalizer([ - (server_name_graph, 0.85), - (domain_graph, 0.87), - (common_words_graph, 0.90), - ]) - - domain_content = ( - pynutil.add_weight(chemical_graph + insert_space, 0.8) - | pynutil.add_weight(make_content(domain_alpha_run), 1.0) + domain_content = pynutil.add_weight(chemical_graph + insert_space, 0.8) | pynutil.add_weight( + make_content(domain_alpha_run), 1.0 ) - + domain_only_graph = delete_domain_tag + domain_content + delete_quote - protocol_only_graph = ( - delete_protocol_tag - + protocol_graph + insert_space - + delete_quote - + delete_space - ) + protocol_only_graph = delete_protocol_tag + protocol_graph + insert_space + delete_quote + delete_space - path_alpha_run = make_alpha_run_verbalizer([ - (domain_graph, 0.87), - (common_words_graph, 0.90), - ]) + path_alpha_run = make_alpha_run_verbalizer( + [ + (domain_graph, 0.87), + (common_words_graph, 0.90), + ] + ) path_content = make_content(path_alpha_run) - path_graph = delete_path_tag + path_content + delete_quote + path_graph = delete_path_tag + path_content + delete_quote - ip_char = single_symbol | single_digit + ip_char = single_symbol | single_digit ip_content = pynini.closure(ip_char, 1) - ip_graph = delete_domain_tag + ip_content + delete_quote + ip_graph = delete_domain_tag + ip_content + delete_quote email_full = username_graph + domain_only_graph - url_full = protocol_only_graph + domain_only_graph + url_full = protocol_only_graph + domain_only_graph graph = ( - pynutil.add_weight(url_full, 1.0) - | pynutil.add_weight(email_full, 1.01) - | pynutil.add_weight(path_graph, 1.02) - | pynutil.add_weight(ip_graph, 1.03) + pynutil.add_weight(url_full, 1.0) + | pynutil.add_weight(email_full, 1.01) + | pynutil.add_weight(path_graph, 1.02) + | pynutil.add_weight(ip_graph, 1.03) | pynutil.add_weight(domain_only_graph, 1.04) ) delete_tokens = self.delete_tokens(graph) - self.fst = delete_tokens.optimize() \ No newline at end of file + self.fst = delete_tokens.optimize() From 27cac481e078eb7adb7321421c4e332b6e5e2d38 Mon Sep 17 00:00:00 2001 From: Shreyas Pawar Date: Tue, 2 Jun 2026 07:40:32 +0000 Subject: [PATCH 3/4] Refactor electronic class with dynamic symbol matrix and chemical rules. Replaced hardcoded TSVs and Python symbol rules with a data-driven symbol_classes.tsv and dynamic elements.tsv. Fixed greedy over-tagging of standard English words and added Unicode support for complex chemical ions without breaking URL hyphen logic. Signed-off-by: Shreyas Pawar --- Jenkinsfile | 2 +- .../hi/data/electronic/chemical_names.tsv | 37 ----- .../hi/data/electronic/elements.tsv | 132 ++++++++++++++++++ .../hi/data/electronic/symbol_classes.tsv | 16 +++ .../hi/data/electronic/symbols.tsv | 5 +- .../hi/taggers/electronic.py | 105 ++++++-------- .../hi/verbalizers/electronic.py | 27 ++-- .../test_cases_electronic.txt | 6 +- 8 files changed, 211 insertions(+), 119 deletions(-) delete mode 100644 nemo_text_processing/text_normalization/hi/data/electronic/chemical_names.tsv create mode 100644 nemo_text_processing/text_normalization/hi/data/electronic/elements.tsv create mode 100644 nemo_text_processing/text_normalization/hi/data/electronic/symbol_classes.tsv diff --git a/Jenkinsfile b/Jenkinsfile index 63986e94e..204ae8c60 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -28,7 +28,7 @@ pipeline { MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-26-26-0' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-02-26-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/electronic/chemical_names.tsv b/nemo_text_processing/text_normalization/hi/data/electronic/chemical_names.tsv deleted file mode 100644 index 3dbecc3bf..000000000 --- a/nemo_text_processing/text_normalization/hi/data/electronic/chemical_names.tsv +++ /dev/null @@ -1,37 +0,0 @@ -H2O जल -H₂O जल -CO2 कार्बन डाइऑक्साइड -CO₂ कार्बन डाइऑक्साइड -O2 ऑक्सीजन -O₂ ऑक्सीजन -N2 नाइट्रोजन -N₂ नाइट्रोजन -NaCl सोडियम क्लोराइड -HCl हाइड्रोक्लोरिक एसिड -H2SO4 सल्फ्यूरिक एसिड -H₂SO₄ सल्फ्यूरिक एसिड -HNO3 नाइट्रिक एसिड -HNO₃ नाइट्रिक एसिड -NH3 अमोनिया -NH₃ अमोनिया -CH4 मीथेन -CH₄ मीथेन -NaOH सोडियम हाइड्रॉक्साइड -KOH पोटेशियम हाइड्रॉक्साइड -Ca(OH)2 कैल्शियम हाइड्रॉक्साइड -Ca(OH)₂ कैल्शियम हाइड्रॉक्साइड -CaCO3 कैल्शियम कार्बोनेट -CaCO₃ कैल्शियम कार्बोनेट -C6H12O6 ग्लूकोज़ -C₆H₁₂O₆ ग्लूकोज़ -NaHCO3 सोडियम बाइकार्बोनेट -NaHCO₃ सोडियम बाइकार्बोनेट -Na2CO3 सोडियम कार्बोनेट -Na₂CO₃ सोडियम कार्बोनेट -CH₃COO– एसीटेट आयन -CH3COO- एसीटेट आयन -CH₃COO⁻ एसीटेट आयन -Ba(OH)2 बेरियम हाइड्रॉक्साइड -Ba(OH)₂ बेरियम हाइड्रॉक्साइड -Al2(SO4)3 एल्युमिनियम सल्फेट -Al₂(SO₄)₃ एल्युमिनियम सल्फेट \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/electronic/elements.tsv b/nemo_text_processing/text_normalization/hi/data/electronic/elements.tsv new file mode 100644 index 000000000..be4610634 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/electronic/elements.tsv @@ -0,0 +1,132 @@ +Ac +Ag +Al +Am +An +Ar +As +At +Au +B +Ba +Be +Bh +Bi +Bk +Br +Bu +Bz +C +Ca +Cd +Ce +Cf +Cl +Cm +Cn +Co +Cp +Cr +Cs +Cu +D +Db +Ds +Dy +En +Er +Es +Et +Eu +F +Fe +Fl +Fm +Fr +Ga +Gd +Ge +H +He +Hf +Hg +Ho +Hs +I +In +Ir +K +Kr +La +Li +Ln +Lr +Lu +Lv +M +Mc +Md +Me +Mg +Mn +Mo +Mt +N +Na +Nb +Nd +Ne +Nh +Ni +No +Np +O +Og +Os +P +Pa +Pb +Pd +Ph +Pm +Po +Pr +Pt +Pu +R +Ra +Rb +Re +Rf +Rg +Rh +Rn +Ru +S +Sb +Sc +Se +Sg +Si +Sm +Sn +Sr +T +Ta +Tb +Tc +Te +Th +Ti +Tl +Tm +Ts +U +V +W +X +Xe +Y +Yb +Zn +Zr \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/electronic/symbol_classes.tsv b/nemo_text_processing/text_normalization/hi/data/electronic/symbol_classes.tsv new file mode 100644 index 000000000..cf17c8756 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/electronic/symbol_classes.tsv @@ -0,0 +1,16 @@ +. email,url,unix,windows +- email,url,unix,windows,chem +_ email,url,unix,windows +/ url,unix +$ unix +\ windows +( windows,chem +) windows,chem ++ url,chem +– chem +# url +? url +& url += url +% url +: url,windows \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/electronic/symbols.tsv b/nemo_text_processing/text_normalization/hi/data/electronic/symbols.tsv index fe8881ae8..e720f5338 100644 --- a/nemo_text_processing/text_normalization/hi/data/electronic/symbols.tsv +++ b/nemo_text_processing/text_normalization/hi/data/electronic/symbols.tsv @@ -30,4 +30,7 @@ $ डॉलर \[ ओपन स्क्वेर ब्रेकेट \] क्लोज़ स्क्वेर ब्रेकेट { ओपन कर्ली ब्रेकेट -} क्लोज़ कर्ली ब्रेकेट \ No newline at end of file +} क्लोज़ कर्ली ब्रेकेट +– माइनस +⁻ माइनस +⁺ प्लस \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/electronic.py b/nemo_text_processing/text_normalization/hi/taggers/electronic.py index 980a7e604..69422084c 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/hi/taggers/electronic.py @@ -36,10 +36,31 @@ def __init__(self, deterministic: bool = True): subscript_digit = pynini.project( pynini.string_file(get_abs_path("data/electronic/subscript_digit.tsv")), "input" ) - alphanumeric = NEMO_ALPHA | NEMO_DIGIT | NEMO_HI_DIGIT | subscript_digit - username_chars = NEMO_ALPHA | NEMO_DIGIT | pynini.accep(".") | pynini.accep("-") | pynini.accep("_") + symbol_dict = {"email": [], "url": [], "unix": [], "windows": [], "chem": []} + + with open(get_abs_path("data/electronic/symbol_classes.tsv"), "r", encoding="utf-8") as f: + for line in f: + if not line.strip(): + continue + parts = line.strip().split("\t") + if len(parts) == 2: + sym = parts[0] + classes = parts[1].split(",") + for c in classes: + if c in symbol_dict: + symbol_dict[c].append(sym) + + email_symbols = pynini.union(*symbol_dict["email"]) + url_symbols = pynini.union(*symbol_dict["url"]) + unix_symbols = pynini.union(*symbol_dict["unix"]) + win_symbols = pynini.union(*symbol_dict["windows"]) + chemical_symbols = pynini.union(*symbol_dict["chem"]) + + unix_segment_syms = pynini.union(*[s for s in symbol_dict["unix"] if s != "/"]) + + username_chars = NEMO_ALPHA | NEMO_DIGIT | email_symbols username = pynutil.insert("username: \"") + pynini.closure(username_chars, 1) + pynutil.insert("\"") domain_chars = NEMO_ALPHA | NEMO_DIGIT | pynini.accep(".") | pynini.accep("-") @@ -59,35 +80,13 @@ def __init__(self, deterministic: bool = True): + pynutil.insert("\"") ) - url_path_chars = alphanumeric | pynini.union( - pynini.accep("."), - pynini.accep("-"), - pynini.accep("_"), - pynini.accep("/"), - pynini.accep("#"), - pynini.accep("?"), - pynini.accep("&"), - pynini.accep("="), - pynini.accep("%"), - pynini.accep("+"), - pynini.accep(":"), - ) + url_path_chars = alphanumeric | url_symbols url_path = pynini.closure(url_path_chars, 1) - url_domain = pynutil.insert(" domain: \"") + url_path + pynutil.insert("\"") - url_graph = protocol + url_domain drive_letter = NEMO_ALPHA - windows_path_chars = alphanumeric | pynini.union( - pynini.accep("\\"), - pynini.accep("."), - pynini.accep("-"), - pynini.accep("_"), - pynini.accep(" "), - pynini.accep("("), - pynini.accep(")"), - ) + windows_path_chars = alphanumeric | win_symbols | pynini.accep(" ") windows_path = ( pynutil.insert("path: \"") + drive_letter @@ -97,35 +96,16 @@ def __init__(self, deterministic: bool = True): + pynutil.insert("\"") ) - unix_path_chars = alphanumeric | pynini.union( - pynini.accep("/"), - pynini.accep("."), - pynini.accep("-"), - pynini.accep("_"), - pynini.accep("$"), - ) - - unix_segment_chars = alphanumeric | pynini.union( - pynini.accep("."), - pynini.accep("-"), - pynini.accep("_"), - pynini.accep("$"), - ) + unix_path_chars = alphanumeric | unix_symbols + unix_segment_chars = alphanumeric | unix_segment_syms unix_segment = pynini.closure(unix_segment_chars, 1) abs_unix_path = pynini.accep("/") + pynini.closure(unix_path_chars, 1) - rel_unix_path = unix_segment + pynini.accep("/") + pynini.closure(unix_path_chars, 0) unix_path = pynutil.insert("path: \"") + (abs_unix_path | rel_unix_path) + pynutil.insert("\"") - backslash_path_chars = alphanumeric | pynini.union( - pynini.accep("\\"), - pynini.accep("."), - pynini.accep("-"), - pynini.accep("_"), - pynini.accep(" "), - ) + backslash_path_chars = alphanumeric | unix_segment_syms | pynini.accep("\\") | pynini.accep(" ") backslash_path = ( pynutil.insert("path: \"") + pynini.accep("\\") @@ -159,32 +139,27 @@ def __init__(self, deterministic: bool = True): pynutil.insert("domain: \"") + filename_stem + pynini.accep(".") + known_extensions + pynutil.insert("\"") ) - chemical_chars = ( - NEMO_ALPHA - | NEMO_DIGIT - | subscript_digit - | pynini.accep("(") - | pynini.accep(")") - | pynini.accep("+") - | pynini.accep("-") - | pynini.accep("–") - ) - - raw_chemical = NEMO_ALPHA + pynini.closure(chemical_chars, 1) - - any_chem = pynini.closure(chemical_chars) + elements = pynini.project(pynini.string_file(get_abs_path("data/electronic/elements.tsv")), "input") + + chem_number = pynini.closure(NEMO_DIGIT | subscript_digit, 1) + + chem_block = elements + pynini.closure(chem_number, 0, 1) + + chem_sequence_chars = chem_block | chemical_symbols | chem_number + + raw_chemical = pynini.closure(chemical_symbols) + chem_block + pynini.closure(chem_sequence_chars) + + any_chem = pynini.closure(chem_sequence_chars) has_open = any_chem + pynini.accep("(") + any_chem no_open = pynini.difference(any_chem, has_open) ends_with_close = any_chem + pynini.accep(")") unbalanced_trailing = pynini.intersect(no_open, ends_with_close) - valid_chemical = pynini.difference(raw_chemical, unbalanced_trailing).optimize() chemical_formula = pynutil.insert("domain: \"") + valid_chemical + pynutil.insert("\"") alnum_seg = pynini.closure(NEMO_ALPHA | NEMO_DIGIT, 1) - separator = pynini.accep("-") | pynini.accep(".") alphanumeric_pattern = alnum_seg + pynini.closure(separator + alnum_seg) @@ -213,4 +188,4 @@ def __init__(self, deterministic: bool = True): ) self.graph = graph.optimize() - self.fst = self.add_tokens(graph).optimize() + self.fst = self.add_tokens(graph).optimize() \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py b/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py index e1b387a2e..60381044d 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py @@ -43,13 +43,12 @@ class ElectronicFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="electronic", kind="verbalize", deterministic=deterministic) - symbols_graph = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).optimize() - domain_graph = pynini.string_file(get_abs_path("data/electronic/domain.tsv")).optimize() - server_name_graph = pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).optimize() - chemical_graph = pynini.string_file(get_abs_path("data/electronic/chemical_names.tsv")).optimize() - common_words_graph = pynini.string_file(get_abs_path("data/electronic/common_words.tsv")).optimize() - latin_to_hindi_graph = pynini.string_file(get_abs_path("data/address/letters.tsv")) - latin_to_hindi_graph = capitalized_input_graph(latin_to_hindi_graph).optimize() + symbols_graph = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).optimize() + domain_graph = pynini.string_file(get_abs_path("data/electronic/domain.tsv")).optimize() + server_name_graph = pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).optimize() + common_words_graph = pynini.string_file(get_abs_path("data/electronic/common_words.tsv")).optimize() + latin_to_hindi_graph = pynini.string_file(get_abs_path("data/address/letters.tsv")) + latin_to_hindi_graph = capitalized_input_graph(latin_to_hindi_graph).optimize() ascii_digit_graph = pynini.string_file(get_abs_path("data/telephone/number.tsv")).optimize() hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize() @@ -105,10 +104,14 @@ def make_content(alpha_run_verb, non_alpha_sep=None): ] ) - domain_content = pynutil.add_weight(chemical_graph + insert_space, 0.8) | pynutil.add_weight( - make_content(domain_alpha_run), 1.0 - ) - + domain_alpha_run = make_alpha_run_verbalizer([ + (server_name_graph, 0.85), + (domain_graph, 0.87), + (common_words_graph, 0.90), + ]) + + domain_content = pynutil.add_weight(make_content(domain_alpha_run), 1.0) + domain_only_graph = delete_domain_tag + domain_content + delete_quote protocol_only_graph = delete_protocol_tag + protocol_graph + insert_space + delete_quote + delete_space @@ -138,4 +141,4 @@ def make_content(alpha_run_verb, non_alpha_sep=None): ) delete_tokens = self.delete_tokens(graph) - self.fst = delete_tokens.optimize() + self.fst = delete_tokens.optimize() \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_electronic.txt index 580a3de15..f9a175794 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_electronic.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_electronic.txt @@ -53,13 +53,13 @@ C:\Users\HP\Documents\Zoom~सी कोलन बैकवर्ड स्ल 255.255.255.0~दो पाँच पाँच डॉट दो पाँच पाँच डॉट दो पाँच पाँच डॉट शून्य आईपी पता है 192.168.1.1~आईपी पता है एक नौ दो डॉट एक छह आठ डॉट एक डॉट एक आईपी एड्रेस 10.0.0.1~आईपी एड्रेस एक शून्य डॉट शून्य डॉट शून्य डॉट एक -ip address 192.168.1.1~आई पी एड्रेस एक नौ दो डॉट एक छह आठ डॉट एक डॉट एक -ip address 10.0.0.1~आई पी एड्रेस एक शून्य डॉट शून्य डॉट शून्य डॉट एक +ip address 192.168.1.1~ip address एक नौ दो डॉट एक छह आठ डॉट एक डॉट एक +ip address 10.0.0.1~ip address एक शून्य डॉट शून्य डॉट शून्य डॉट एक report.pdf~आर ई पी ओ आर टी डॉट पी डी एफ photo.jpg~पी एच ओ टी ओ डॉट जे पी जी data.csv~डेटा डॉट सी एस वी robinson.org~आर ओ बी आई एन एस ओ एन डॉट ऑर्ग anand@gmail.com~ए एन ए एन डी एट जीमेल डॉट कॉम -Al₂(SO₄)₃~एल्युमिनियम सल्फेट +Al₂(SO₄)₃~ए एल दो ओपन ब्रेकेट एस ओ चार क्लोज़ ब्रेकेट तीन C₂H₄~सी दो एच चार home/desktop~होम फॉरवर्ड स्लैश डेस्कटॉप \ No newline at end of file From 148d752516924a555b44d65e001d8e3e4b88ace2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 2 Jun 2026 08:29:59 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../hi/taggers/electronic.py | 18 +++++------ .../hi/verbalizers/electronic.py | 30 ++++++++++--------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/electronic.py b/nemo_text_processing/text_normalization/hi/taggers/electronic.py index 69422084c..a309ec089 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/hi/taggers/electronic.py @@ -39,10 +39,10 @@ def __init__(self, deterministic: bool = True): alphanumeric = NEMO_ALPHA | NEMO_DIGIT | NEMO_HI_DIGIT | subscript_digit symbol_dict = {"email": [], "url": [], "unix": [], "windows": [], "chem": []} - + with open(get_abs_path("data/electronic/symbol_classes.tsv"), "r", encoding="utf-8") as f: for line in f: - if not line.strip(): + if not line.strip(): continue parts = line.strip().split("\t") if len(parts) == 2: @@ -57,7 +57,7 @@ def __init__(self, deterministic: bool = True): unix_symbols = pynini.union(*symbol_dict["unix"]) win_symbols = pynini.union(*symbol_dict["windows"]) chemical_symbols = pynini.union(*symbol_dict["chem"]) - + unix_segment_syms = pynini.union(*[s for s in symbol_dict["unix"] if s != "/"]) username_chars = NEMO_ALPHA | NEMO_DIGIT | email_symbols @@ -140,15 +140,15 @@ def __init__(self, deterministic: bool = True): ) elements = pynini.project(pynini.string_file(get_abs_path("data/electronic/elements.tsv")), "input") - + chem_number = pynini.closure(NEMO_DIGIT | subscript_digit, 1) - + chem_block = elements + pynini.closure(chem_number, 0, 1) - + chem_sequence_chars = chem_block | chemical_symbols | chem_number - + raw_chemical = pynini.closure(chemical_symbols) + chem_block + pynini.closure(chem_sequence_chars) - + any_chem = pynini.closure(chem_sequence_chars) has_open = any_chem + pynini.accep("(") + any_chem no_open = pynini.difference(any_chem, has_open) @@ -188,4 +188,4 @@ def __init__(self, deterministic: bool = True): ) self.graph = graph.optimize() - self.fst = self.add_tokens(graph).optimize() \ No newline at end of file + self.fst = self.add_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py b/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py index 60381044d..398c79fef 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py @@ -43,12 +43,12 @@ class ElectronicFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="electronic", kind="verbalize", deterministic=deterministic) - symbols_graph = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).optimize() - domain_graph = pynini.string_file(get_abs_path("data/electronic/domain.tsv")).optimize() - server_name_graph = pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).optimize() - common_words_graph = pynini.string_file(get_abs_path("data/electronic/common_words.tsv")).optimize() - latin_to_hindi_graph = pynini.string_file(get_abs_path("data/address/letters.tsv")) - latin_to_hindi_graph = capitalized_input_graph(latin_to_hindi_graph).optimize() + symbols_graph = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).optimize() + domain_graph = pynini.string_file(get_abs_path("data/electronic/domain.tsv")).optimize() + server_name_graph = pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).optimize() + common_words_graph = pynini.string_file(get_abs_path("data/electronic/common_words.tsv")).optimize() + latin_to_hindi_graph = pynini.string_file(get_abs_path("data/address/letters.tsv")) + latin_to_hindi_graph = capitalized_input_graph(latin_to_hindi_graph).optimize() ascii_digit_graph = pynini.string_file(get_abs_path("data/telephone/number.tsv")).optimize() hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize() @@ -104,14 +104,16 @@ def make_content(alpha_run_verb, non_alpha_sep=None): ] ) - domain_alpha_run = make_alpha_run_verbalizer([ - (server_name_graph, 0.85), - (domain_graph, 0.87), - (common_words_graph, 0.90), - ]) - + domain_alpha_run = make_alpha_run_verbalizer( + [ + (server_name_graph, 0.85), + (domain_graph, 0.87), + (common_words_graph, 0.90), + ] + ) + domain_content = pynutil.add_weight(make_content(domain_alpha_run), 1.0) - + domain_only_graph = delete_domain_tag + domain_content + delete_quote protocol_only_graph = delete_protocol_tag + protocol_graph + insert_space + delete_quote + delete_space @@ -141,4 +143,4 @@ def make_content(alpha_run_verb, non_alpha_sep=None): ) delete_tokens = self.delete_tokens(graph) - self.fst = delete_tokens.optimize() \ No newline at end of file + self.fst = delete_tokens.optimize()