JavaScript 正则表达式:匹配包含阿拉伯语特殊字符 'ـ'(U+0640)的字符串

javascript regex to match string contain arabic special characters symbols 'ـ' u0640

本文关键字:符号 特殊字符 u0640 阿拉伯 正则表达式 字符串 包含 javascript      更新时间:2023-09-26

我需要一个 JavaScript 正则表达式,它可以找到包含 "ـ"(U+0640)的阿拉伯语文本,例如:

如果我搜索这个词:الله

输出将发现:الله

اللـه

الـله

الـلـه

将其与下一个突出显示文本的脚本一起使用。

请参阅此演示。


更新:假设我想用英语搜索这个字符串:

"中空的";

我想要一个正则表达式,既能找到单词 "中空的"(hollow),也能找到字母之间夹有符号 "ـ"(Unicode 为 U+0640)的同一个单词,例如:

";h"o"l"l"o"w"

或重复

";h

现在,我之前提到的脚本可以在阿拉伯语中找到并突出显示变音符号和非变音符号文本,也可以在英语中找到重音和非重音敏感文本

例如,脚本可以找到:

带变音符号的文本:";و

不带变音符号的相同文本:";وديناالددا''1606Bب

同样是不区分重音的英语:

";séléction:可以找到

";选择":可以发现它的大写字母太小无所谓

通过使用xregex,我可以通过以下代码完成映射:

 var charToAccentedCharClassMap = {
// for arabic
"'u0623":'(['u0627'u0625'u0622'u0623]''p{Mn}+|['u0627'u0625'u0622'u0623])',
"'u0627":'(['u0627'u0625'u0622'u0623]''p{Mn}+|['u0627'u0625'u0622'u0623])',
"'u0625":'(['u0627'u0625'u0622'u0623]''p{Mn}+|['u0627'u0625'u0622'u0623])',
"'u0622":'(['u0627'u0625'u0622'u0623]''p{Mn}+|['u0627'u0625'u0622'u0623])',
"'u06D6":'('u06D6''p{Mn}+|'u06D6)',
"'u0628":'('u0628''p{Mn}+|'u0628)',
"'u062A":'('u062A''p{Mn}+|'u062A)',
"'u062B":'('u062B''p{Mn}+|'u062B)',
"'u062C":'('u062C''p{Mn}+|'u062C)',
"'u062D":'('u062D''p{Mn}+|'u062D)',
"'u062E":'('u062E''p{Mn}+|'u062E)', 
"'u062F":'('u062F''p{Mn}+|'u062F)',
"'u0630":'('u0630''p{Mn}+|'u0630)',
"'u0631":'('u0631''p{Mn}+|'u0631)',
"'u0632":'('u0632''p{Mn}+|'u0632)',
"'u0633":'('u0633''p{Mn}+|'u0633)',
"'u0634":'('u0634''p{Mn}+|'u0634)',
"'u0635":'('u0635''p{Mn}+|'u0635)',
"'u0636":'('u0636''p{Mn}+|'u0636)',
"'u0637":'('u0637''p{Mn}+|'u0637)',
"'u0638":'('u0638''p{Mn}+|'u0638)',
"'u0639":'('u0639''p{Mn}+|'u0639)',
"'u063A":'('u063A''p{Mn}+|'u063A)',
"'u0641":'('u0641''p{Mn}+|'u0641)',
"'u0642":'('u0642''p{Mn}+|'u0642)',
"'u0643":'('u0643''p{Mn}+|'u0643)',
"'u0644":'('u0644''p{Mn}+|'u0644)',
"'u0645":'('u0645''p{Mn}+|'u0645)',
"'u0646":'('u0646''p{Mn}+|'u0646)',
"'u0647":'('u0647''p{Mn}+|'u0647)',
"'u064A":'('u064A''p{Mn}+|'u064A)',
"'u0649":'('u0649''p{Mn}+|'u0649)',
"'u0626":'('u0626''p{Mn}+|'u0626)',
"'u0621":'('u0621''p{Mn}+|'u0621)',
"'u0629":'('u0629''p{Mn}+|'u0629)',
"'u0648":'(['u0624'u0648]''p{Mn}+|['u0624'u0648])',
"'u0624":'(['u0624'u0648]''p{Mn}+|['u0624'u0648])',
"'u0640":'(['u0640]''p{Mn}+|['u0640])',
  // for other languages
'A':'[Aa'xaa'xc0'xc5'xe0'xe5'u0100'u0105'u01cd'u01ce'u0200'u0203'u0226'u0227'u1d2c'u1d43'u1e00'u1e01'u1e9a'u1ea0'u1ea3'u2090'u2100'u2101'u213b'u249c'u24b6'u24d0'u3371'u3374'u3380'u3384'u3388'u3389'u33a9'u33af'u33c2'u33ca'u33df'u33ff'uff21'uff41]',
'B' : '[Bb'u1d2e'u1d47'u1e02-'u1e07'u212c'u249d'u24b7'u24d1'u3374'u3385-'u3387'u33c3'u33c8'u33d4'u33dd'uff22'uff42]',
'C' : '[Cc'xc7'xe7'u0106-'u010d'u1d9c'u2100'u2102'u2103'u2105'u2106'u212d'u216d'u217d'u249e'u24b8'u24d2'u3376'u3388'u3389'u339d'u33a0'u33a4'u33c4-'u33c7'uff23'uff43]',
'D' : '[Dd'u010e'u010f'u01c4-'u01c6'u01f1-'u01f3'u1d30'u1d48'u1e0a-'u1e13'u2145'u2146'u216e'u217e'u249f'u24b9'u24d3'u32cf'u3372'u3377-'u3379'u3397'u33ad-'u33af'u33c5'u33c8'uff24'uff44]',
'E' : '[Ee'xc8-'xcb'xe8-'xeb'u0112-'u011b'u0204-'u0207'u0228'u0229'u1d31'u1d49'u1e18-'u1e1b'u1eb8-'u1ebd'u2091'u2121'u212f'u2130'u2147'u24a0'u24ba'u24d4'u3250'u32cd'u32ce'uff25'uff45]',
'F' : '[Ff'u1da0'u1e1e'u1e1f'u2109'u2131'u213b'u24a1'u24bb'u24d5'u338a-'u338c'u3399'ufb00-'ufb04'uff26'uff46]',
'G' : '[Gg'u011c-'u0123'u01e6'u01e7'u01f4'u01f5'u1d33'u1d4d'u1e20'u1e21'u210a'u24a2'u24bc'u24d6'u32cc'u32cd'u3387'u338d-'u338f'u3393'u33ac'u33c6'u33c9'u33d2'u33ff'uff27'uff47]',
'H' : '[Hh'u0124'u0125'u021e'u021f'u02b0'u1d34'u1e22-'u1e2b'u1e96'u210b-'u210e'u24a3'u24bd'u24d7'u32cc'u3371'u3390-'u3394'u33ca'u33cb'u33d7'uff28'uff48]',
'I' : '[Ii'xcc-'xcf'xec-'xef'u0128-'u0130'u0132'u0133'u01cf'u01d0'u0208-'u020b'u1d35'u1d62'u1e2c'u1e2d'u1ec8-'u1ecb'u2071'u2110'u2111'u2139'u2148'u2160-'u2163'u2165-'u2168'u216a'u216b'u2170-'u2173'u2175-'u2178'u217a'u217b'u24a4'u24be'u24d8'u337a'u33cc'u33d5'ufb01'ufb03'uff29'uff49]',
'J' : '[Jj'u0132-'u0135'u01c7-'u01cc'u01f0'u02b2'u1d36'u2149'u24a5'u24bf'u24d9'u2c7c'uff2a'uff4a]',
'K' : '[Kk'u0136'u0137'u01e8'u01e9'u1d37'u1d4f'u1e30-'u1e35'u212a'u24a6'u24c0'u24da'u3384'u3385'u3389'u338f'u3391'u3398'u339e'u33a2'u33a6'u33aa'u33b8'u33be'u33c0'u33c6'u33cd-'u33cf'uff2b'uff4b]',
'L' : '[Ll'u0139-'u0140'u01c7-'u01c9'u02e1'u1d38'u1e36'u1e37'u1e3a-'u1e3d'u2112'u2113'u2121'u216c'u217c'u24a7'u24c1'u24db'u32cf'u3388'u3389'u33d0-'u33d3'u33d5'u33d6'u33ff'ufb02'ufb04'uff2c'uff4c]',
'M' : '[Mm'u1d39'u1d50'u1e3e-'u1e43'u2120'u2122'u2133'u216f'u217f'u24a8'u24c2'u24dc'u3377-'u3379'u3383'u3386'u338e'u3392'u3396'u3399-'u33a8'u33ab'u33b3'u33b7'u33b9'u33bd'u33bf'u33c1'u33c2'u33ce'u33d0'u33d4-'u33d6'u33d8'u33d9'u33de'u33df'uff2d'uff4d]',
'N' : '[Nn'xd1'xf1'u0143-'u0149'u01ca-'u01cc'u01f8'u01f9'u1d3a'u1e44-'u1e4b'u207f'u2115'u2116'u24a9'u24c3'u24dd'u3381'u338b'u339a'u33b1'u33b5'u33bb'u33cc'u33d1'uff2e'uff4e]',
'O' : '[Oo'xba'xd2-'xd6'xf2-'xf6'u014c-'u0151'u01a0'u01a1'u01d1'u01d2'u01ea'u01eb'u020c-'u020f'u022e'u022f'u1d3c'u1d52'u1ecc-'u1ecf'u2092'u2105'u2116'u2134'u24aa'u24c4'u24de'u3375'u33c7'u33d2'u33d6'uff2f'uff4f]',
'P' : '[Pp'u1d3e'u1d56'u1e54-'u1e57'u2119'u24ab'u24c5'u24df'u3250'u3371'u3376'u3380'u338a'u33a9-'u33ac'u33b0'u33b4'u33ba'u33cb'u33d7-'u33da'uff30'uff50]',
'Q' : '[Qq'u211a'u24ac'u24c6'u24e0'u33c3'uff31'uff51]',
'R' : '[Rr'u0154-'u0159'u0210-'u0213'u02b3'u1d3f'u1d63'u1e58-'u1e5b'u1e5e'u1e5f'u20a8'u211b-'u211d'u24ad'u24c7'u24e1'u32cd'u3374'u33ad-'u33af'u33da'u33db'uff32'uff52]',
'S' : '[Ss'u015a-'u0161'u017f'u0218'u0219'u02e2'u1e60-'u1e63'u20a8'u2101'u2120'u24ae'u24c8'u24e2'u33a7'u33a8'u33ae-'u33b3'u33db'u33dc'ufb06'uff33'uff53]',
'T' : '[Tt'u0162-'u0165'u021a'u021b'u1d40'u1d57'u1e6a-'u1e71'u1e97'u2121'u2122'u24af'u24c9'u24e3'u3250'u32cf'u3394'u33cf'ufb05'ufb06'uff34'uff54]',
'U' : '[Uu'xd9-'xdc'xf9-'xfc'u0168-'u0173'u01af'u01b0'u01d3'u01d4'u0214-'u0217'u1d41'u1d58'u1d64'u1e72-'u1e77'u1ee4-'u1ee7'u2106'u24b0'u24ca'u24e4'u3373'u337a'uff35'uff55]',
'V' : '[Vv'u1d5b'u1d65'u1e7c-'u1e7f'u2163-'u2167'u2173-'u2177'u24b1'u24cb'u24e5'u2c7d'u32ce'u3375'u33b4-'u33b9'u33dc'u33de'uff36'uff56]',
'W' : '[Ww'u0174'u0175'u02b7'u1d42'u1e80-'u1e89'u1e98'u24b2'u24cc'u24e6'u33ba-'u33bf'u33dd'uff37'uff57]',
'X' : '[Xx'u02e3'u1e8a-'u1e8d'u2093'u213b'u2168-'u216b'u2178-'u217b'u24b3'u24cd'u24e7'u33d3'uff38'uff58]',
'Y' : '[Yy'xdd'xfd'xff'u0176-'u0178'u0232'u0233'u02b8'u1e8e'u1e8f'u1e99'u1ef2-'u1ef9'u24b4'u24ce'u24e8'u33c9'uff39'uff59]',
'Z' : '[Zz'u0179-'u017e'u01f1-'u01f3'u1dbb'u1e90-'u1e95'u2124'u2128'u24b5'u24cf'u24e9'u3390-'u3394'uff3a'uff5a]'
};

然后我改进了脚本,发现字符串包含单词之间的符号,只忽略整个字符串的开头和结尾:

例如:

如果我键入:"你好世界"

结果会发现:

";你好世界"

";你好<gt;:;'".,#%^@)(世界"

使用此代码:

 var inputPattern = XRegExp('('+input.split(' ').join('[''s''p{P}''p{S}''p{m}]+')+')',"g");

\p{P}:匹配所有标点符号

\p{S}:匹配所有符号

\p{m}:匹配所有变音符号(组合标记)

剩下的最后一件事,是让脚本能够找到包含特殊字符 "ـ"(U+0640)的单词,该字符用于某些古兰经阿拉伯语文本

例如:

بõماللهبين

当添加 "ـ" 时,它将是:

ب

我不知道怎么做。

查看脚本的实际工作方式:

演示。


借助trincot解决这是最终结果:

演示。

您可以在每个非空格字符后添加以下正则表达式模式:\u0640*。它允许任意数量的 tatweel 字符(U+0640)出现在任何非空格字符之后。

您已经使用以下调用对非空格字符执行了匹配:

var regexp = cleanString.replace(/'S/g, accentReplacer);

因此,函数accentReplacer可能是应用这种修改的最佳位置:

var accentReplacer = function(character) {
    var h = charToAccentedCharClassMap[character] || character;
    return h + ''u0640*';
};