diff --git a/Jenkinsfile b/Jenkinsfile index d9c3a5984..63986e94e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -28,7 +28,7 @@ pipeline { MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-13-26-0' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-26-26-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/electronic/chemical_names.tsv b/nemo_text_processing/text_normalization/hi/data/electronic/chemical_names.tsv new file mode 100644 index 000000000..3dbecc3bf --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/electronic/chemical_names.tsv @@ -0,0 +1,37 @@ +H2O जल +H₂O जल +CO2 कार्बन डाइऑक्साइड +CO₂ कार्बन डाइऑक्साइड +O2 ऑक्सीजन +O₂ ऑक्सीजन +N2 नाइट्रोजन +N₂ नाइट्रोजन +NaCl सोडियम क्लोराइड +HCl हाइड्रोक्लोरिक एसिड +H2SO4 सल्फ्यूरिक एसिड +H₂SO₄ सल्फ्यूरिक एसिड +HNO3 नाइट्रिक एसिड +HNO₃ नाइट्रिक एसिड +NH3 अमोनिया +NH₃ अमोनिया +CH4 मीथेन +CH₄ मीथेन +NaOH सोडियम हाइड्रॉक्साइड +KOH पोटेशियम हाइड्रॉक्साइड +Ca(OH)2 कैल्शियम हाइड्रॉक्साइड +Ca(OH)₂ कैल्शियम हाइड्रॉक्साइड +CaCO3 कैल्शियम कार्बोनेट +CaCO₃ कैल्शियम कार्बोनेट +C6H12O6 ग्लूकोज़ +C₆H₁₂O₆ ग्लूकोज़ +NaHCO3 सोडियम बाइकार्बोनेट +NaHCO₃ सोडियम बाइकार्बोनेट +Na2CO3 सोडियम कार्बोनेट +Na₂CO₃ सोडियम कार्बोनेट +CH₃COO– एसीटेट आयन +CH3COO- एसीटेट आयन +CH₃COO⁻ एसीटेट आयन +Ba(OH)2 बेरियम हाइड्रॉक्साइड +Ba(OH)₂ बेरियम हाइड्रॉक्साइड +Al2(SO4)3 एल्युमिनियम सल्फेट +Al₂(SO₄)₃ एल्युमिनियम सल्फेट \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/electronic.py b/nemo_text_processing/text_normalization/hi/taggers/electronic.py index 7807117e6..980a7e604 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/hi/taggers/electronic.py @@ -26,7 +26,7 @@ class ElectronicFst(GraphFst): e.g. kumar@gmail.com -> tokens { electronic { username: "kumar" domain: "gmail.com" } } e.g. https://google.com/ -> tokens { electronic { protocol: "https" domain: "google.com/" } } e.g. C:\\Users\\HP\\Desktop -> tokens { electronic { path: "C:\\Users\\HP\\Desktop" } } - e.g. 192.168.1.1 -> tokens { electronic { ip: "192.168.1.1" } } + e.g. 192.168.1.1 -> tokens { electronic { domain: "192.168.1.1" } } """ @@ -39,7 +39,6 @@ def __init__(self, deterministic: bool = True): alphanumeric = NEMO_ALPHA | NEMO_DIGIT | NEMO_HI_DIGIT | subscript_digit - # email username_chars = NEMO_ALPHA | NEMO_DIGIT | pynini.accep(".") | pynini.accep("-") | pynini.accep("_") username = pynutil.insert("username: \"") + pynini.closure(username_chars, 1) + pynutil.insert("\"") @@ -48,7 +47,6 @@ def __init__(self, deterministic: bool = True): email_graph = username + pynini.cross("@", "") + domain - # url: protocol handling for https://, http://, www., and combined forms protocol_start = pynini.cross("https://", "https") | pynini.cross("http://", "http") protocol_end = pynini.cross("www.", "www") protocol = ( @@ -80,7 +78,6 @@ def __init__(self, deterministic: bool = True): url_graph = protocol + url_domain - # file paths: Windows (C:\...), Unix (/...), and backslash-prefixed (\...) drive_letter = NEMO_ALPHA windows_path_chars = alphanumeric | pynini.union( pynini.accep("\\"), @@ -107,9 +104,20 @@ def __init__(self, deterministic: bool = True): pynini.accep("_"), pynini.accep("$"), ) - unix_path = ( - pynutil.insert("path: \"") + pynini.accep("/") + pynini.closure(unix_path_chars, 1) + pynutil.insert("\"") + + unix_segment_chars = alphanumeric | pynini.union( + pynini.accep("."), + pynini.accep("-"), + pynini.accep("_"), + pynini.accep("$"), ) + unix_segment = pynini.closure(unix_segment_chars, 1) + + abs_unix_path = pynini.accep("/") + pynini.closure(unix_path_chars, 1) + + rel_unix_path = unix_segment + pynini.accep("/") + pynini.closure(unix_path_chars, 0) + + unix_path = pynutil.insert("path: \"") + (abs_unix_path | rel_unix_path) + pynutil.insert("\"") backslash_path_chars = alphanumeric | pynini.union( pynini.accep("\\"), @@ -125,12 +133,10 @@ def __init__(self, deterministic: bool = True): + pynutil.insert("\"") ) - # ip addresses: exactly 4 dot-separated octets ip_octet = pynini.closure(NEMO_DIGIT, 1, 3) dot_octet = pynini.accep(".") + ip_octet ip_address = pynutil.insert("domain: \"") + ip_octet + pynini.closure(dot_octet, 3, 3) + pynutil.insert("\"") - # domains: simple TLD-based (abc.com) and government/education suffixes (.gov.in, .ac.in) domain_segment_chars = NEMO_ALPHA | NEMO_DIGIT | pynini.accep("-") domain_segment = pynini.closure(domain_segment_chars, 1) @@ -144,7 +150,6 @@ def __init__(self, deterministic: bool = True): pynutil.insert("domain: \"") + domain_body + pynini.closure(pynini.accep("/"), 0, 1) + pynutil.insert("\"") ) - # file extensions: e.g. report.pdf, data.csv known_extensions = pynini.project( pynini.string_file(get_abs_path("data/electronic/file_extensions.tsv")), "input" ) @@ -154,27 +159,46 @@ def __init__(self, deterministic: bool = True): pynutil.insert("domain: \"") + filename_stem + pynini.accep(".") + known_extensions + pynutil.insert("\"") ) - # chemical formulas with subscript digits: e.g. H₂O, CO₂ - chemical_chars = NEMO_ALPHA | subscript_digit - chemical_formula = ( - pynutil.insert("domain: \"") + NEMO_ALPHA + pynini.closure(chemical_chars, 1) + pynutil.insert("\"") + chemical_chars = ( + NEMO_ALPHA + | NEMO_DIGIT + | subscript_digit + | pynini.accep("(") + | pynini.accep(")") + | pynini.accep("+") + | pynini.accep("-") + | pynini.accep("–") ) - # alphanumeric codes: strings containing both letters and digits, - # optionally separated by hyphens, e.g. IELF004, N95, GSAT-18, F-35B + raw_chemical = NEMO_ALPHA + pynini.closure(chemical_chars, 1) + + any_chem = pynini.closure(chemical_chars) + has_open = any_chem + pynini.accep("(") + any_chem + no_open = pynini.difference(any_chem, has_open) + ends_with_close = any_chem + pynini.accep(")") + + unbalanced_trailing = pynini.intersect(no_open, ends_with_close) + + valid_chemical = pynini.difference(raw_chemical, unbalanced_trailing).optimize() + + chemical_formula = pynutil.insert("domain: \"") + valid_chemical + pynutil.insert("\"") + alnum_seg = pynini.closure(NEMO_ALPHA | NEMO_DIGIT, 1) - alphanumeric_pattern = alnum_seg + pynini.closure(pynini.accep("-") + alnum_seg) - alnum_hyp_sigma = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.accep("-")) - contains_alpha = alnum_hyp_sigma + NEMO_ALPHA + alnum_hyp_sigma - contains_digit = alnum_hyp_sigma + NEMO_DIGIT + alnum_hyp_sigma + separator = pynini.accep("-") | pynini.accep(".") + alphanumeric_pattern = alnum_seg + pynini.closure(separator + alnum_seg) + + alnum_hyp_dot_sigma = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.accep("-") | pynini.accep(".")) + + contains_alpha = alnum_hyp_dot_sigma + NEMO_ALPHA + alnum_hyp_dot_sigma + contains_digit = alnum_hyp_dot_sigma + NEMO_DIGIT + alnum_hyp_dot_sigma + alphanumeric_code_fst = pynini.intersect( pynini.intersect(alphanumeric_pattern, contains_alpha), contains_digit ).optimize() alphanumeric_code = pynutil.insert("domain: \"") + alphanumeric_code_fst + pynutil.insert("\"") - # Weights use 3 tiers: structurally unambiguous (1.0), moderately general (1.1), greedy (1.2) graph = ( pynutil.add_weight(url_graph, 1.0) | pynutil.add_weight(email_graph, 1.0) diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py b/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py index 124e1c60b..e1b387a2e 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/electronic.py @@ -30,10 +30,10 @@ class ElectronicFst(GraphFst): Uses a phonetic-first approach with letter-by-letter fallback. Examples: - electronic { username: "kumar" domain: "gmail.com" } -> "कुमार एट जीमेल डॉट कॉम" + electronic { username: "kumar" domain: "gmail.com" } -> "के यू एम ए आर एट जीमेल डॉट कॉम" electronic { protocol: "https" domain: "google.com/" } -> "एच टी टी पी एस कोलन फॉरवर्ड स्लैश फॉरवर्ड स्लैश गूगल डॉट कॉम फॉरवर्ड स्लैश" - electronic { path: "C:\\Users\\HP" } -> "सी कोलन बैकवर्ड स्लैश यूज़र्स बैकवर्ड स्लैश एच पी" - electronic { ip: "192.168.1.1" } -> "एक नौ दो डॉट एक छह आठ डॉट एक डॉट एक" + electronic { path: "C:\\Users\\HP\\Desktop" } -> "सी कोलन बैकवर्ड स्लैश यूज़र्स बैकवर्ड स्लैश एच पी बैकवर्ड स्लैश डेस्कटॉप" + electronic { domain: "192.168.1.1" } -> "एक नौ दो डॉट एक छह आठ डॉट एक डॉट एक" Args: deterministic: if True will provide a single transduction option, @@ -43,102 +43,92 @@ class ElectronicFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="electronic", kind="verbalize", deterministic=deterministic) - # Load data files symbols_graph = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).optimize() domain_graph = pynini.string_file(get_abs_path("data/electronic/domain.tsv")).optimize() server_name_graph = pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).optimize() + chemical_graph = pynini.string_file(get_abs_path("data/electronic/chemical_names.tsv")).optimize() common_words_graph = pynini.string_file(get_abs_path("data/electronic/common_words.tsv")).optimize() latin_to_hindi_graph = pynini.string_file(get_abs_path("data/address/letters.tsv")) latin_to_hindi_graph = capitalized_input_graph(latin_to_hindi_graph).optimize() - # Digit mappings - use telephone number mappings for ASCII digits ascii_digit_graph = pynini.string_file(get_abs_path("data/telephone/number.tsv")).optimize() hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize() hindi_zero_graph = pynini.string_file(get_abs_path("data/numbers/zero.tsv")).optimize() subscript_digit_graph = pynini.string_file(get_abs_path("data/electronic/subscript_digit.tsv")).optimize() digit_verbalization = ascii_digit_graph | hindi_digit_graph | hindi_zero_graph | subscript_digit_graph - # Combined phonetic word graph: server names + common words - phonetic_word = server_name_graph | common_words_graph + protocol_graph = pynini.string_file(get_abs_path("data/electronic/protocols.tsv")).optimize() - # ============ CHARACTER VERBALIZATION ============ - # Single character to Hindi verbalization with space insertion - char_to_hindi = pynutil.add_weight(latin_to_hindi_graph, 1.0) | pynutil.add_weight( # Letter mapping - digit_verbalization, 1.0 - ) # Digit mapping - char_with_space = char_to_hindi + insert_space + single_letter = latin_to_hindi_graph + insert_space + single_digit = digit_verbalization + insert_space + single_symbol = symbols_graph + insert_space - # ============ SYMBOL VERBALIZATION ============ - symbol_to_hindi = symbols_graph + insert_space + single_non_alpha = pynutil.add_weight(single_symbol, 1.0) | pynutil.add_weight(single_digit, 1.0) - # ============ DOMAIN VERBALIZATION ============ - # Domain extension verbalization (.com -> डॉट कॉम) - domain_ext_verbalization = pynini.cross(".", "डॉट ") + domain_graph + insert_space + def make_alpha_run_verbalizer(tsv_graphs): + phonetic = pynini.union(*[pynutil.add_weight(g + insert_space, w) for g, w in tsv_graphs]) + literal = pynutil.add_weight(pynini.closure(single_letter, 1), 1.1) + return phonetic | literal - # ============ PROTOCOL VERBALIZATION ============ - protocol_graph = pynini.string_file(get_abs_path("data/electronic/protocols.tsv")).optimize() - protocol_verbalization = protocol_graph + insert_space + def make_content(alpha_run_verb, non_alpha_sep=None): + if non_alpha_sep is None: + non_alpha_sep = single_non_alpha + mandatory_sep = pynini.closure(non_alpha_sep, 1) + return ( + pynini.closure(non_alpha_sep, 0) + + pynini.closure(alpha_run_verb + mandatory_sep, 0) + + pynini.closure(alpha_run_verb, 0, 1) + + pynini.closure(non_alpha_sep, 0) + ) - # ============ FIELD EXTRACTION ============ - # Extract username field delete_username_tag = pynutil.delete("username: \"") delete_domain_tag = pynutil.delete("domain: \"") delete_protocol_tag = pynutil.delete("protocol: \"") delete_path_tag = pynutil.delete("path: \"") delete_quote = pynutil.delete("\"") - # Username verbalization: letter-by-letter with symbol handling - username_content = pynini.closure( - pynutil.add_weight(phonetic_word + insert_space, 0.9) - | pynutil.add_weight(symbol_to_hindi, 1.0) - | pynutil.add_weight(char_with_space, 1.1), - 1, + username_alpha_run = make_alpha_run_verbalizer( + [ + (server_name_graph, 0.85), + (domain_graph, 0.87), + (common_words_graph, 0.90), + ] ) - - username_graph = ( - delete_username_tag + username_content + delete_quote + delete_space + pynutil.insert("एट ") # @ symbol + username_content = make_content(username_alpha_run) + username_graph = delete_username_tag + username_content + delete_quote + delete_space + pynutil.insert("एट ") + + domain_alpha_run = make_alpha_run_verbalizer( + [ + (server_name_graph, 0.85), + (domain_graph, 0.87), + (common_words_graph, 0.90), + ] ) - # Domain verbalization - domain_content = pynini.closure( - pynutil.add_weight(phonetic_word + insert_space, 0.9) - | pynutil.add_weight(domain_ext_verbalization, 0.95) - | pynutil.add_weight(symbol_to_hindi, 1.0) - | pynutil.add_weight(char_with_space, 1.1), - 1, + domain_content = pynutil.add_weight(chemical_graph + insert_space, 0.8) | pynutil.add_weight( + make_content(domain_alpha_run), 1.0 ) domain_only_graph = delete_domain_tag + domain_content + delete_quote - # Protocol verbalization - protocol_only_graph = delete_protocol_tag + protocol_verbalization + delete_quote + delete_space + protocol_only_graph = delete_protocol_tag + protocol_graph + insert_space + delete_quote + delete_space - # Path verbalization (Windows/Unix file paths) - path_content = pynini.closure( - pynutil.add_weight(common_words_graph + insert_space, 0.9) - | pynutil.add_weight(symbol_to_hindi, 1.0) - | pynutil.add_weight(char_with_space, 1.1), - 1, + path_alpha_run = make_alpha_run_verbalizer( + [ + (domain_graph, 0.87), + (common_words_graph, 0.90), + ] ) - + path_content = make_content(path_alpha_run) path_graph = delete_path_tag + path_content + delete_quote - # IP address verbalization (digit by digit) - ip_char = pynutil.add_weight(symbols_graph + insert_space, 1.0) | pynutil.add_weight( - digit_verbalization + insert_space, 1.0 - ) + ip_char = single_symbol | single_digit ip_content = pynini.closure(ip_char, 1) - ip_graph = delete_domain_tag + ip_content + delete_quote - # ============ COMBINED GRAPH ============ - # Email: username + domain email_full = username_graph + domain_only_graph - - # URL with protocol: protocol + domain url_full = protocol_only_graph + domain_only_graph - # Combined final graph graph = ( pynutil.add_weight(url_full, 1.0) | pynutil.add_weight(email_full, 1.01) diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_electronic.txt index 85b34c4a3..580a3de15 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_electronic.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_electronic.txt @@ -57,4 +57,9 @@ ip address 192.168.1.1~आई पी एड्रेस एक नौ दो ड ip address 10.0.0.1~आई पी एड्रेस एक शून्य डॉट शून्य डॉट शून्य डॉट एक report.pdf~आर ई पी ओ आर टी डॉट पी डी एफ photo.jpg~पी एच ओ टी ओ डॉट जे पी जी -data.csv~डेटा डॉट सी एस वी \ No newline at end of file +data.csv~डेटा डॉट सी एस वी +robinson.org~आर ओ बी आई एन एस ओ एन डॉट ऑर्ग +anand@gmail.com~ए एन ए एन डी एट जीमेल डॉट कॉम +Al₂(SO₄)₃~एल्युमिनियम सल्फेट +C₂H₄~सी दो एच चार +home/desktop~होम फॉरवर्ड स्लैश डेस्कटॉप \ No newline at end of file