Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ pipeline {
MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1'
JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1'
KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0'
HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-13-26-0'
HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-26-26-0'
DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
}
stages {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
H2O जल
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't make sense to hardcode some chemical compounds and not cover others. If we want coverage for chemical formulas, it would make more sense to list all the individual elements and add rules that cover how they are composed.

H₂O जल
CO2 कार्बन डाइऑक्साइड
CO₂ कार्बन डाइऑक्साइड
O2 ऑक्सीजन
O₂ ऑक्सीजन
N2 नाइट्रोजन
N₂ नाइट्रोजन
NaCl सोडियम क्लोराइड
HCl हाइड्रोक्लोरिक एसिड
H2SO4 सल्फ्यूरिक एसिड
H₂SO₄ सल्फ्यूरिक एसिड
HNO3 नाइट्रिक एसिड
HNO₃ नाइट्रिक एसिड
NH3 अमोनिया
NH₃ अमोनिया
CH4 मीथेन
CH₄ मीथेन
NaOH सोडियम हाइड्रॉक्साइड
KOH पोटेशियम हाइड्रॉक्साइड
Ca(OH)2 कैल्शियम हाइड्रॉक्साइड
Ca(OH)₂ कैल्शियम हाइड्रॉक्साइड
CaCO3 कैल्शियम कार्बोनेट
CaCO₃ कैल्शियम कार्बोनेट
C6H12O6 ग्लूकोज़
C₆H₁₂O₆ ग्लूकोज़
NaHCO3 सोडियम बाइकार्बोनेट
NaHCO₃ सोडियम बाइकार्बोनेट
Na2CO3 सोडियम कार्बोनेट
Na₂CO₃ सोडियम कार्बोनेट
CH₃COO– एसीटेट आयन
CH3COO- एसीटेट आयन
CH₃COO⁻ एसीटेट आयन
Ba(OH)2 बेरियम हाइड्रॉक्साइड
Ba(OH)₂ बेरियम हाइड्रॉक्साइड
Al2(SO4)3 एल्युमिनियम सल्फेट
Al₂(SO₄)₃ एल्युमिनियम सल्फेट
64 changes: 44 additions & 20 deletions nemo_text_processing/text_normalization/hi/taggers/electronic.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class ElectronicFst(GraphFst):
e.g. kumar@gmail.com -> tokens { electronic { username: "kumar" domain: "gmail.com" } }
e.g. https://google.com/ -> tokens { electronic { protocol: "https" domain: "google.com/" } }
e.g. C:\\Users\\HP\\Desktop -> tokens { electronic { path: "C:\\Users\\HP\\Desktop" } }
e.g. 192.168.1.1 -> tokens { electronic { ip: "192.168.1.1" } }
e.g. 192.168.1.1 -> tokens { electronic { domain: "192.168.1.1" } }

"""

Expand All @@ -39,7 +39,6 @@ def __init__(self, deterministic: bool = True):

alphanumeric = NEMO_ALPHA | NEMO_DIGIT | NEMO_HI_DIGIT | subscript_digit

# email
username_chars = NEMO_ALPHA | NEMO_DIGIT | pynini.accep(".") | pynini.accep("-") | pynini.accep("_")
username = pynutil.insert("username: \"") + pynini.closure(username_chars, 1) + pynutil.insert("\"")

Expand All @@ -48,7 +47,6 @@ def __init__(self, deterministic: bool = True):

email_graph = username + pynini.cross("@", "") + domain

# url: protocol handling for https://, http://, www., and combined forms
protocol_start = pynini.cross("https://", "https") | pynini.cross("http://", "http")
protocol_end = pynini.cross("www.", "www")
protocol = (
Expand Down Expand Up @@ -80,7 +78,6 @@ def __init__(self, deterministic: bool = True):

url_graph = protocol + url_domain

# file paths: Windows (C:\...), Unix (/...), and backslash-prefixed (\...)
drive_letter = NEMO_ALPHA
windows_path_chars = alphanumeric | pynini.union(
pynini.accep("\\"),
Expand All @@ -107,9 +104,20 @@ def __init__(self, deterministic: bool = True):
pynini.accep("_"),
pynini.accep("$"),
)
unix_path = (
pynutil.insert("path: \"") + pynini.accep("/") + pynini.closure(unix_path_chars, 1) + pynutil.insert("\"")

unix_segment_chars = alphanumeric | pynini.union(
pynini.accep("."),
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's load these characters from a TSV file instead of hardcoding them.

pynini.accep("-"),
pynini.accep("_"),
pynini.accep("$"),
)
unix_segment = pynini.closure(unix_segment_chars, 1)

abs_unix_path = pynini.accep("/") + pynini.closure(unix_path_chars, 1)

rel_unix_path = unix_segment + pynini.accep("/") + pynini.closure(unix_path_chars, 0)

unix_path = pynutil.insert("path: \"") + (abs_unix_path | rel_unix_path) + pynutil.insert("\"")

backslash_path_chars = alphanumeric | pynini.union(
pynini.accep("\\"),
Expand All @@ -125,12 +133,10 @@ def __init__(self, deterministic: bool = True):
+ pynutil.insert("\"")
)

# ip addresses: exactly 4 dot-separated octets
ip_octet = pynini.closure(NEMO_DIGIT, 1, 3)
dot_octet = pynini.accep(".") + ip_octet
ip_address = pynutil.insert("domain: \"") + ip_octet + pynini.closure(dot_octet, 3, 3) + pynutil.insert("\"")

# domains: simple TLD-based (abc.com) and government/education suffixes (.gov.in, .ac.in)
domain_segment_chars = NEMO_ALPHA | NEMO_DIGIT | pynini.accep("-")
domain_segment = pynini.closure(domain_segment_chars, 1)

Expand All @@ -144,7 +150,6 @@ def __init__(self, deterministic: bool = True):
pynutil.insert("domain: \"") + domain_body + pynini.closure(pynini.accep("/"), 0, 1) + pynutil.insert("\"")
)

# file extensions: e.g. report.pdf, data.csv
known_extensions = pynini.project(
pynini.string_file(get_abs_path("data/electronic/file_extensions.tsv")), "input"
)
Expand All @@ -154,27 +159,46 @@ def __init__(self, deterministic: bool = True):
pynutil.insert("domain: \"") + filename_stem + pynini.accep(".") + known_extensions + pynutil.insert("\"")
)

# chemical formulas with subscript digits: e.g. H₂O, CO₂
chemical_chars = NEMO_ALPHA | subscript_digit
chemical_formula = (
pynutil.insert("domain: \"") + NEMO_ALPHA + pynini.closure(chemical_chars, 1) + pynutil.insert("\"")
chemical_chars = (
NEMO_ALPHA
| NEMO_DIGIT
| subscript_digit
| pynini.accep("(")
| pynini.accep(")")
| pynini.accep("+")
| pynini.accep("-")
| pynini.accep("–")
)

# alphanumeric codes: strings containing both letters and digits,
# optionally separated by hyphens, e.g. IELF004, N95, GSAT-18, F-35B
raw_chemical = NEMO_ALPHA + pynini.closure(chemical_chars, 1)

any_chem = pynini.closure(chemical_chars)
has_open = any_chem + pynini.accep("(") + any_chem
no_open = pynini.difference(any_chem, has_open)
ends_with_close = any_chem + pynini.accep(")")

unbalanced_trailing = pynini.intersect(no_open, ends_with_close)

valid_chemical = pynini.difference(raw_chemical, unbalanced_trailing).optimize()

chemical_formula = pynutil.insert("domain: \"") + valid_chemical + pynutil.insert("\"")

alnum_seg = pynini.closure(NEMO_ALPHA | NEMO_DIGIT, 1)
alphanumeric_pattern = alnum_seg + pynini.closure(pynini.accep("-") + alnum_seg)

alnum_hyp_sigma = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.accep("-"))
contains_alpha = alnum_hyp_sigma + NEMO_ALPHA + alnum_hyp_sigma
contains_digit = alnum_hyp_sigma + NEMO_DIGIT + alnum_hyp_sigma
separator = pynini.accep("-") | pynini.accep(".")
alphanumeric_pattern = alnum_seg + pynini.closure(separator + alnum_seg)

alnum_hyp_dot_sigma = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.accep("-") | pynini.accep("."))

contains_alpha = alnum_hyp_dot_sigma + NEMO_ALPHA + alnum_hyp_dot_sigma
contains_digit = alnum_hyp_dot_sigma + NEMO_DIGIT + alnum_hyp_dot_sigma

alphanumeric_code_fst = pynini.intersect(
pynini.intersect(alphanumeric_pattern, contains_alpha), contains_digit
).optimize()

alphanumeric_code = pynutil.insert("domain: \"") + alphanumeric_code_fst + pynutil.insert("\"")

# Weights use 3 tiers: structurally unambiguous (1.0), moderately general (1.1), greedy (1.2)
graph = (
pynutil.add_weight(url_graph, 1.0)
| pynutil.add_weight(email_graph, 1.0)
Expand Down
106 changes: 48 additions & 58 deletions nemo_text_processing/text_normalization/hi/verbalizers/electronic.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,10 @@ class ElectronicFst(GraphFst):
Uses a phonetic-first approach with letter-by-letter fallback.

Examples:
electronic { username: "kumar" domain: "gmail.com" } -> "कुमार एट जीमेल डॉट कॉम"
electronic { username: "kumar" domain: "gmail.com" } -> "के यू एम ए आर एट जीमेल डॉट कॉम"
electronic { protocol: "https" domain: "google.com/" } -> "एच टी टी पी एस कोलन फॉरवर्ड स्लैश फॉरवर्ड स्लैश गूगल डॉट कॉम फॉरवर्ड स्लैश"
electronic { path: "C:\\Users\\HP" } -> "सी कोलन बैकवर्ड स्लैश यूज़र्स बैकवर्ड स्लैश एच पी"
electronic { ip: "192.168.1.1" } -> "एक नौ दो डॉट एक छह आठ डॉट एक डॉट एक"
electronic { path: "C:\\Users\\HP\\Desktop" } -> "सी कोलन बैकवर्ड स्लैश यूज़र्स बैकवर्ड स्लैश एच पी बैकवर्ड स्लैश डेस्कटॉप"
electronic { domain: "192.168.1.1" } -> "एक नौ दो डॉट एक छह आठ डॉट एक डॉट एक"

Args:
deterministic: if True will provide a single transduction option,
Expand All @@ -43,102 +43,92 @@ class ElectronicFst(GraphFst):
def __init__(self, deterministic: bool = True):
super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)

# Load data files
symbols_graph = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).optimize()
domain_graph = pynini.string_file(get_abs_path("data/electronic/domain.tsv")).optimize()
server_name_graph = pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).optimize()
chemical_graph = pynini.string_file(get_abs_path("data/electronic/chemical_names.tsv")).optimize()
common_words_graph = pynini.string_file(get_abs_path("data/electronic/common_words.tsv")).optimize()
latin_to_hindi_graph = pynini.string_file(get_abs_path("data/address/letters.tsv"))
latin_to_hindi_graph = capitalized_input_graph(latin_to_hindi_graph).optimize()

# Digit mappings - use telephone number mappings for ASCII digits
ascii_digit_graph = pynini.string_file(get_abs_path("data/telephone/number.tsv")).optimize()
hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize()
hindi_zero_graph = pynini.string_file(get_abs_path("data/numbers/zero.tsv")).optimize()
subscript_digit_graph = pynini.string_file(get_abs_path("data/electronic/subscript_digit.tsv")).optimize()
digit_verbalization = ascii_digit_graph | hindi_digit_graph | hindi_zero_graph | subscript_digit_graph

# Combined phonetic word graph: server names + common words
phonetic_word = server_name_graph | common_words_graph
protocol_graph = pynini.string_file(get_abs_path("data/electronic/protocols.tsv")).optimize()

# ============ CHARACTER VERBALIZATION ============
# Single character to Hindi verbalization with space insertion
char_to_hindi = pynutil.add_weight(latin_to_hindi_graph, 1.0) | pynutil.add_weight( # Letter mapping
digit_verbalization, 1.0
) # Digit mapping
char_with_space = char_to_hindi + insert_space
single_letter = latin_to_hindi_graph + insert_space
single_digit = digit_verbalization + insert_space
single_symbol = symbols_graph + insert_space

# ============ SYMBOL VERBALIZATION ============
symbol_to_hindi = symbols_graph + insert_space
single_non_alpha = pynutil.add_weight(single_symbol, 1.0) | pynutil.add_weight(single_digit, 1.0)

# ============ DOMAIN VERBALIZATION ============
# Domain extension verbalization (.com -> डॉट कॉम)
domain_ext_verbalization = pynini.cross(".", "डॉट ") + domain_graph + insert_space
# Build the verbalizer for a single run of letters.
#
# tsv_graphs: iterable of (graph, weight) pairs. Each graph is followed by an
# inserted space and wrapped with its own weight; the weighted graphs are then
# unioned into one "phonetic" alternative.
# Returns: the union of that phonetic alternative and a letter-by-letter
# fallback (one or more `single_letter`, weight 1.1).
# NOTE(review): 1.1 is larger than the 0.85-0.90 weights passed by the callers
# visible in this diff, so presumably a table match is preferred over spelling
# the run out letter by letter — confirm against the weight semantics in use.
def make_alpha_run_verbalizer(tsv_graphs):
phonetic = pynini.union(*[pynutil.add_weight(g + insert_space, w) for g, w in tsv_graphs])
literal = pynutil.add_weight(pynini.closure(single_letter, 1), 1.1)
return phonetic | literal

# ============ PROTOCOL VERBALIZATION ============
protocol_graph = pynini.string_file(get_abs_path("data/electronic/protocols.tsv")).optimize()
protocol_verbalization = protocol_graph + insert_space
# Assemble the verbalizer for a whole field value out of letter runs and
# non-letter separators.
#
# alpha_run_verb: graph that verbalizes one run of letters.
# non_alpha_sep: graph that verbalizes one non-letter item; when None it falls
# back to `single_non_alpha` from the enclosing scope.
# Returns a graph accepting, in order:
#   1. zero or more leading separators,
#   2. zero or more (letter run + at least one separator) groups,
#   3. an optional final letter run,
#   4. zero or more trailing separators.
# Because every non-final letter run must be followed by `mandatory_sep`, two
# letter runs can never sit directly next to each other.
def make_content(alpha_run_verb, non_alpha_sep=None):
if non_alpha_sep is None:
non_alpha_sep = single_non_alpha
# One or more separators: required between consecutive letter runs.
mandatory_sep = pynini.closure(non_alpha_sep, 1)
return (
pynini.closure(non_alpha_sep, 0)
+ pynini.closure(alpha_run_verb + mandatory_sep, 0)
+ pynini.closure(alpha_run_verb, 0, 1)
+ pynini.closure(non_alpha_sep, 0)
)

# ============ FIELD EXTRACTION ============
# Extract username field
delete_username_tag = pynutil.delete("username: \"")
delete_domain_tag = pynutil.delete("domain: \"")
delete_protocol_tag = pynutil.delete("protocol: \"")
delete_path_tag = pynutil.delete("path: \"")
delete_quote = pynutil.delete("\"")

# Username verbalization: letter-by-letter with symbol handling
username_content = pynini.closure(
pynutil.add_weight(phonetic_word + insert_space, 0.9)
| pynutil.add_weight(symbol_to_hindi, 1.0)
| pynutil.add_weight(char_with_space, 1.1),
1,
username_alpha_run = make_alpha_run_verbalizer(
[
(server_name_graph, 0.85),
(domain_graph, 0.87),
(common_words_graph, 0.90),
]
)

username_graph = (
delete_username_tag + username_content + delete_quote + delete_space + pynutil.insert("एट ") # @ symbol
username_content = make_content(username_alpha_run)
username_graph = delete_username_tag + username_content + delete_quote + delete_space + pynutil.insert("एट ")

domain_alpha_run = make_alpha_run_verbalizer(
[
(server_name_graph, 0.85),
(domain_graph, 0.87),
(common_words_graph, 0.90),
]
)

# Domain verbalization
domain_content = pynini.closure(
pynutil.add_weight(phonetic_word + insert_space, 0.9)
| pynutil.add_weight(domain_ext_verbalization, 0.95)
| pynutil.add_weight(symbol_to_hindi, 1.0)
| pynutil.add_weight(char_with_space, 1.1),
1,
domain_content = pynutil.add_weight(chemical_graph + insert_space, 0.8) | pynutil.add_weight(
make_content(domain_alpha_run), 1.0
)

domain_only_graph = delete_domain_tag + domain_content + delete_quote

# Protocol verbalization
protocol_only_graph = delete_protocol_tag + protocol_verbalization + delete_quote + delete_space
protocol_only_graph = delete_protocol_tag + protocol_graph + insert_space + delete_quote + delete_space

# Path verbalization (Windows/Unix file paths)
path_content = pynini.closure(
pynutil.add_weight(common_words_graph + insert_space, 0.9)
| pynutil.add_weight(symbol_to_hindi, 1.0)
| pynutil.add_weight(char_with_space, 1.1),
1,
path_alpha_run = make_alpha_run_verbalizer(
[
(domain_graph, 0.87),
(common_words_graph, 0.90),
]
)

path_content = make_content(path_alpha_run)
path_graph = delete_path_tag + path_content + delete_quote

# IP address verbalization (digit by digit)
ip_char = pynutil.add_weight(symbols_graph + insert_space, 1.0) | pynutil.add_weight(
digit_verbalization + insert_space, 1.0
)
ip_char = single_symbol | single_digit
ip_content = pynini.closure(ip_char, 1)

ip_graph = delete_domain_tag + ip_content + delete_quote

# ============ COMBINED GRAPH ============
# Email: username + domain
email_full = username_graph + domain_only_graph

# URL with protocol: protocol + domain
url_full = protocol_only_graph + domain_only_graph

# Combined final graph
graph = (
pynutil.add_weight(url_full, 1.0)
| pynutil.add_weight(email_full, 1.01)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,9 @@ ip address 192.168.1.1~आई पी एड्रेस एक नौ दो ड
ip address 10.0.0.1~आई पी एड्रेस एक शून्य डॉट शून्य डॉट शून्य डॉट एक
report.pdf~आर ई पी ओ आर टी डॉट पी डी एफ
photo.jpg~पी एच ओ टी ओ डॉट जे पी जी
data.csv~डेटा डॉट सी एस वी
data.csv~डेटा डॉट सी एस वी
robinson.org~आर ओ बी आई एन एस ओ एन डॉट ऑर्ग
anand@gmail.com~ए एन ए एन डी एट जीमेल डॉट कॉम
Al₂(SO₄)₃~एल्युमिनियम सल्फेट
C₂H₄~सी दो एच चार
home/desktop~होम फॉरवर्ड स्लैश डेस्कटॉप