Skip to content

Commit 1be6e45

Browse files
authored
Merge branch 'main' into update-version-v0.9.7
2 parents af14115 + 2ce3c1f commit 1be6e45

File tree

9 files changed

+513
-11
lines changed

9 files changed

+513
-11
lines changed

charabia/benches/bench.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ static DATA_SET: &[((usize, Script, Language), &str)] = &[
1717
((130, Script::Greek, Language::Ell), "Οι θερμοκρασίες είναι σπάνια υπερβολικές στις παραθαλάσσιες περιοχές."),
1818
((132, Script::Khmer, Language::Khm), "ធ្វេីមនុស្សត្រូវចេះស្រលាញ់នឹងជួយគ្នាទៅវិញទៅមក ព្រោះពិភពលោកនេះមានទុកច្រេីនហេីយគួយតែមានអំពេីល្អច្រេីនមិនថាជួយបាន១រឺ២នាក់ច្រេីនរឺតិចទេ៕"),
1919
((132, Script::Arabic, Language::Ara), "اللُّغَةُ العربية هي أكثر اللغات السامية تحدثا، ومن أكثر اللغات انتشارا"),
20+
((128, Script::Arabic, Language::Pes), "قنات قصبه شهر گناباد عمیق‌ترین و قدیمی‌ترین کاریز جهان است."),
2021
((134, Script::Arabic, Language::Vie), "Các nhà nước trong lịch sử Việt Nam có những quốc hiệu khác nhau như Xích Quỷ, Văn Lang, Đại Việt, Đại"),
2122
((131, Script::Latin, Language::Deu), "Deutschland vereint Alpen, Küsten und Städte wie Berlin. Kultur und Geschichte prägen das Land, das Natur und Moderne verbindet."),
2223

@@ -31,6 +32,7 @@ static DATA_SET: &[((usize, Script, Language), &str)] = &[
3132
((364, Script::Greek, Language::Ell), "Η άνοιξη έχει μικρή διάρκεια, διότι ο μεν χειμώνας είναι όψιμος, το δε καλοκαίρι αρχίζει πρώιμα. Το φθινόπωρο είναι μακρύ και θερμό και πολλές φορές παρατείνεται στη νότια Ελλάδα και τα νησιά μέχρι τα"),
3233
((327, Script::Khmer, Language::Khm), "រឿងពីរដែលមនុស្សហាមចិត្តខ្លួនឯងមិនបានគឺ សើច និង ស្រឡាញ់។ តែសម្រាប់ខ្ញុំ ប្រាក់ ចន្ទធីតា រឿងមួយទៀតដែលខ្ញុំហាមចិត្តខ្លួនឯងមិនបាននោះ គឺញ៉ាំ គេគ្រប់គ្នាពេលខូចចិត្តបាយទឹកមិនបានទេ តែខ្ញុំពេលខូចចិត្តដឹងតែឃ្លាន ញ៉ាំច្រើនឬតិចក៏អាស្រ័យលើថាទំហំនៃការខូចចិត្តខ្លាំងឬខ្សោយ។"),
3334
((366, Script::Arabic, Language::Ara), "العربية لغةٌ رسمية في كل دول الوطن العربي (إضافة إلى كونها لغة رسمية في تشاد وإريتريا). وهي إحدى اللغات الرسمية الست في منظمة الأمم المتحدة، ويُحتفل بالعربية في 18 ديسمبر كذكرى اعتمادها في الأمم المتحدة."),
35+
((366, Script::Arabic, Language::Pes), "فارسی یکی از زبان‌های هندواروپایی و زبان رسمی ایران، افغانستان (با نام دری) و تاجیکستان (با نام تاجیکی) است. این زبان دارای پیشینه‌ای کهن و ادبیاتی غنی می‌باشد و در طول تاریخ، شاعران و نویسندگان بزرگی به این زبان آثار خود را خلق کرده‌اند. فارسی همچنین یکی از زبان‌های مهم منطقه به شمار می‌رود و در سازمان‌های بین‌المللی نیز جایگاه ویژه‌ای دارد."),
3436
((365, Script::Latin, Language::Vie), "Lãnh thổ Việt Nam xuất hiện con người sinh sống từ thời đại đồ đá cũ, khởi đầu với các nhà nước Văn Lang, Âu Lạc. Âu Lạc bị nhà Triệu ở phương Bắc thôn tính vào đầu thế kỷ thứ 2 TCN sau đó là thời kỳ Bắc thuộc kéo dài hơn một thiên niên kỷ.Chế độ quân chủ độc lập"),
3537
((354, Script::Latin, Language::Deu), "Magdeburg, die Hauptstadt Sachsen-Anhalts, beeindruckt mit dem Magdeburger Dom, dem Jahrtausendturm im Elbauenpark und dem Wasserstraßenkreuz. Der Domplatz ist umgeben von Bauwerken, wie dem Hundertwasserhaus. Der Elbauenpark bietet viele Freizeitmöglichkeiten, während die Magdeburger Börde für fruchtbare Ackerflächen für z.B. Zuckerrüben bekannt ist."),
3638
];

charabia/src/detection/chars.rs

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,24 @@ pub(crate) fn is_arabic(ch: char) -> bool {
4242
)
4343
}
4444

45+
// Based on: https://en.wikipedia.org/wiki/Persian_alphabet
46+
pub(crate) fn is_persian(ch: char) -> bool {
47+
matches!(
48+
ch,
49+
// Persian-specific letters
50+
| '\u{067E}' // Peh
51+
| '\u{0686}' // Tcheh
52+
| '\u{0698}' // Jeh
53+
| '\u{06A9}' // Keheh (Persian Kaf)
54+
| '\u{06AF}' // Gaf
55+
| '\u{06CC}' // Farsi Yeh
56+
| '\u{06C0}' // Heh with Yeh above (used in Dari)
57+
58+
// Persian digits
59+
| '\u{06F0}'..='\u{06F9}' // Zero to nine
60+
)
61+
}
62+
4563
// Based on https://en.wikipedia.org/wiki/Devanagari#Unicode
4664
pub(crate) fn is_devanagari(ch: char) -> bool {
4765
matches!(ch, '\u{0900}'..='\u{097F}' | '\u{A8E0}'..='\u{A8FF}' | '\u{1CD0}'..='\u{1CFF}')
@@ -287,4 +305,17 @@ mod tests {
287305
assert!(is_hebrew('ׇ'));
288306
assert!(!is_hebrew('s'));
289307
}
308+
309+
#[test]
310+
fn test_is_persian() {
311+
assert!(is_persian('پ')); // Peh
312+
assert!(is_persian('ژ')); // Zheh
313+
assert!(is_persian('گ')); // Gaf
314+
assert!(is_persian('ک')); // Keheh (Persian Kaf)
315+
assert!(is_persian('ی')); // Farsi Yeh
316+
assert!(is_persian('۱')); // Persian digit one (U+06F1)
317+
assert!(is_persian('۲')); // Persian digit two (U+06F2)
318+
319+
assert!(!is_persian('z')); // Latin 'z'
320+
}
290321
}

charabia/src/detection/script_language.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ impl From<char> for Script {
188188
Script::Latin
189189
} else if chars::is_cyrillic(other) {
190190
Script::Cyrillic
191-
} else if chars::is_arabic(other) {
191+
} else if chars::is_arabic(other) || chars::is_persian(other) {
192192
Script::Arabic
193193
} else if chars::is_devanagari(other) {
194194
Script::Devanagari

charabia/src/normalizer/arabic.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,15 @@ impl CharNormalizer for ArabicNormalizer {
2727
fn normalize_arabic_char(c: char) -> Option<CharOrStr> {
2828
match c {
2929
'ـ' => None,
30-
'ٱ' => Some('ا'.into()),
30+
'أ' | 'إ' | 'آ' | 'ٱ' => Some('ا'.into()), // All Alef variants to Alef
3131
'ى' => Some('ي'.into()),
3232
'ة' => Some('ه'.into()),
3333
_ => Some(c.into()),
3434
}
3535
}
3636

3737
fn is_shoud_normalize(c: char) -> bool {
38-
matches!(c, 'ـ' | 'ٱ' | 'ى' | 'ة')
38+
matches!(c, 'ـ' | 'أ' | 'إ' | 'آ' | 'ٱ' | 'ى' | 'ة')
3939
}
4040

4141
#[cfg(test)]

charabia/src/normalizer/mod.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
use std::borrow::Cow;
22

3-
use once_cell::sync::Lazy;
4-
53
pub use self::ae_oe_normalizer::AeOeNormalizer;
64
pub use self::arabic::ArabicNormalizer;
75
#[cfg(feature = "chinese-normalization")]
@@ -15,6 +13,7 @@ use self::greek::GreekNormalizer;
1513
pub use self::japanese::JapaneseNormalizer;
1614
pub use self::lowercase::LowercaseNormalizer;
1715
use self::nonspacing_mark::NonspacingMarkNormalizer;
16+
pub use self::persian::PersianNormalizer;
1817
use self::quote::QuoteNormalizer;
1918
#[cfg(feature = "swedish-recomposition")]
2019
use self::swedish_recomposition::SwedishRecompositionNormalizer;
@@ -24,6 +23,7 @@ pub use self::turkish::TurkishNormalizer;
2423
pub use self::vietnamese::VietnameseNormalizer;
2524
use crate::segmenter::SegmentedTokenIter;
2625
use crate::Token;
26+
use once_cell::sync::Lazy;
2727

2828
mod arabic;
2929
#[cfg(feature = "chinese-normalization")]
@@ -46,6 +46,7 @@ mod turkish;
4646
mod vietnamese;
4747

4848
mod ae_oe_normalizer;
49+
mod persian;
4950

5051
/// List of [`Normalizer`]s used by [`Normalize::normalize`] that are not considered lossy.
5152
pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
@@ -55,6 +56,7 @@ pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
5556
Box::new(SwedishRecompositionNormalizer),
5657
Box::new(ControlCharNormalizer),
5758
Box::new(Classifier),
59+
Box::new(PersianNormalizer),
5860
]
5961
});
6062

0 commit comments

Comments
 (0)