本文整理汇总了Python中pyarabic.araby.stripTashkeel函数的典型用法代码示例。如果您正苦于以下问题:Python stripTashkeel函数的具体用法?Python stripTashkeel怎么用?Python stripTashkeel使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了stripTashkeel函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: check_normalized
def check_normalized(self, word_vocalised, resulted_data):
    """
    Filter analysis results so that only entries whose unvocalized form
    matches the unvocalized input word are kept.

    Some normalized spellings make the dictionary return several
    vocalized candidates (e.g. ذئب normalized to ذءب may yield both
    ذئب and ذؤب); this keeps only the candidates consistent with the
    original input word.

    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the results found in the dictionary.
    @type resulted_data: list of dict.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    stripped_input = araby.stripTashkeel(word_vocalised)
    kept = []
    for case in resulted_data:
        attributes = case.__dict__
        # skip results that carry no vocalized form at all
        if "vocalized" not in attributes:
            continue
        if araby.stripTashkeel(attributes["vocalized"]) == stripped_input:
            kept.append(case)
    return kept
示例2: getStemVariants
def getStemVariants(self, stem, prefix, suffix):
    """
    Generate the noun stem variants implied by the given affixes.
    For example مدرستي => مدرست + ي => مدرسة + ي.
    Return the set of possible stem spellings.
    @param stem: the input stem.
    @type stem: unicode.
    @param prefix: prefixe.
    @type prefix: unicode.
    @param suffix: suffixe.
    @type suffix: unicode.
    @return: stem variants.
    @rtype: set of unicode.
    """
    # compare affixes without diacritics
    prefix = araby.stripTashkeel(prefix)
    suffix = araby.stripTashkeel(suffix)
    # the unchanged stem is always a candidate
    variants = set([stem])
    # suffixes after which a dropped Teh Marbuta may be restored
    teh_marbuta_suffixes = (
        araby.ALEF + araby.TEH,
        araby.YEH + araby.TEH_MARBUTA,
        araby.YEH,
        araby.YEH + araby.ALEF + araby.TEH,
    )
    if suffix in teh_marbuta_suffixes:
        variants.add(stem + araby.TEH_MARBUTA)
    # a bare stem or a dual/plural ending may hide a final Yeh
    if suffix in ("", araby.YEH + araby.NOON, araby.WAW + araby.NOON):
        variants.add(stem + araby.YEH)
    # a final Yeh may stand for Alef Maksura
    if stem.endswith(araby.YEH):
        variants.add(stem[:-1] + araby.ALEF_MAKSURA)
    # to be validated
    return variants
示例3: isPossibleCollocation
def isPossibleCollocation(self, list2, context="", lenght=2):
    """
    Guess whether the given word list can form a collocation.
    Used to collect unknown collocations from user input.
    @param list2: list of words, 2 or more.
    @type list2: list of unicode.
    @param context: tag describing the surrounding clause.
    @type context: unicode.
    @param lenght: minimum number of words in the collocation.
    @type lenght: integer.
    @return: the rule number of the found collocation, 100 by default,
    0 when too short, -1 when a word is not a valid token.
    @rtype: integer.
    """
    # too few words: no collocation possible
    if len(list2) < lenght:
        return 0
    word1 = araby.stripTashkeel(list2[0])
    word2 = araby.stripTashkeel(list2[1])
    # both words must look like real tokens (no punctuation, etc.)
    if not collocation_const.token_pat.search(word1) \
       or not collocation_const.token_pat.search(word2):
        return -1
    if word1 in collocation_const.ADDITIONAL_WORDS:
        return 10
    if word1 in collocation_const.NAMED_PRIOR:
        return 15
    if word2 not in collocation_const.SPECIAL_DEFINED:
        # both words carry the definite article
        if word2.startswith(u"ال") and word1.startswith(u"ال"):
            return 20
        # first ends with Teh Marbuta, second is definite
        if word1.endswith(u"ة") and word2.startswith(u"ال"):
            return 30
        # both end with Teh Marbuta
        if word1.endswith(u"ة") and word2.endswith(u"ة"):
            return 40
        # a noun context followed by a definite word
        if context != u"" and context in collocation_const.tab_noun_context \
           and word2.startswith(u"ال"):
            return 50
        # feminine plural followed by a definite word
        if word1.endswith(u"ات") and word2.startswith(u"ال"):
            return 60
    return 100
示例4: getSuffixVariant
def getSuffixVariant(self, word, suffix, enclitic):
    """
    Get the suffix variant to be joined to the word.
    For example: word = مدرس, suffix = ة, enclitic = ي: the suffix
    Teh Marbuta is converted to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: variant of suffixes (vocalized suffix and vocalized
    suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    stripped_enclitic = araby.stripTashkeel(enclitic)
    # by default the suffix is kept unchanged
    variant = suffix
    if len(stripped_enclitic) > 0 and suffix.find(araby.TEH_MARBUTA) >= 0:
        # a Teh Marbuta opens into a Teh before an enclitic
        variant = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
    elif not stripped_enclitic and araby.isHaraka(suffix) \
         and word[-1:] in (araby.ALEF_MAKSURA, araby.YEH, araby.ALEF):
        # a lone haraka cannot follow a weak final letter
        variant = u""
    # generate the suffix without the I'rab short mark; we look up the
    # GIVEN suffix because the variant may not exist in the table
    if u'متحرك' in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
        variant_no_irab = araby.stripLastHaraka(variant)
    else:
        variant_no_irab = variant
    return variant, variant_no_irab
示例5: Comparetashkeel
def Comparetashkeel(text):
    """
    Evaluate the automatic vocalization of a correctly vocalized text.

    The input is stripped of its tashkeel, re-vocalized with the
    Tashkeel engine, then the two versions are compared token by token.

    @param text: a correctly vocalized input text.
    @type text: unicode.
    @return: [vocalized_text, "correct:X%", "incorrect:Y%", total tokens]
    @rtype: list.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    # the entred text is vocalized correctly
    correct_text=text;
    text=araby.stripTashkeel(text);
    vocalizer=ArabicVocalizer.TashkeelClass();
    vocalized_text=vocalizer.tashkeel(text);
    # compare voalized text with a correct text
    text1=correct_text;
    text2=vocalized_text;
    # remove collocations symboles
    text2=text2.replace("'","");
    text2=text2.replace("~","");
    #stemmer=tashaphyne.stemming.ArabicLightStemmer()
    list1=vocalizer.analyzer.tokenize(text1);
    list2=vocalizer.analyzer.tokenize(text2);
    print u":".join(list1).encode('utf8');
    print u":".join(list2).encode('utf8');
    correct=0;
    incorrect=0;
    total=len(list1);
    if len(list1)!=len(list2):
        # token streams diverged: percentages below will both be 0
        print "lists haven't the same length";
    else:
        for i in range(total):
            # count tokens vocalized alike (partial vocalization tolerated)
            if araby.vocalizedlike(list1[i],list2[i]):
                correct+=1;
            else:
                incorrect+=1;
    # NOTE(review): raises ZeroDivisionError when the text tokenizes to an
    # empty list (total == 0) — confirm callers never pass empty input
    result=[vocalized_text,"correct:%0.2f%%"%round(correct*100.00/total,2),"incorrect:%0.2f%%"%round(incorrect*100.00/total,2),total]
    return result#correct*100/total;
示例6: generate_possible_conjug
def generate_possible_conjug(self, infinitive_verb, unstemed_verb, affix,
                             future_type=araby.FATHA, externPrefix="-",
                             externSuffix="-", transitive=True):
    """
    Conjugate infinitive_verb for every (tense, pronoun) pair compatible
    with the given affix, and keep the conjugations whose unvocalized
    form equals unstemed_verb.

    @param infinitive_verb: the candidate infinitive form.
    @type infinitive_verb: unicode.
    @param unstemed_verb: the unvocalized verb as found in the text.
    @type unstemed_verb: unicode.
    @param affix: the conjugation affix stripped from the word.
    @type affix: unicode.
    @param future_type: the future mark of the verb (default FATHA).
    @type future_type: unicode.
    @param externPrefix: procletic used for compatibility checking.
    @type externPrefix: unicode.
    @param externSuffix: encletic used for compatibility checking.
    @type externSuffix: unicode.
    @param transitive: whether the verb is transitive.
    @type transitive: boolean.
    @return: matching conjugations as dicts with keys
    'verb', 'tense', 'pronoun', 'vocalized', 'unvocalized'
    (an empty set when any input is empty, kept for compatibility).
    @rtype: list of dict.
    """
    if infinitive_verb == "" or unstemed_verb == "" or affix == "":
        return set()
    list_correct_conj = []
    future_type = libqutrub.ar_verb.get_future_type_entree(future_type)
    vb = libqutrub.classverb.verbclass(infinitive_verb, transitive, future_type)
    # The Alef is not part of the prefix: it is a prosthetic letter added
    # to avoid starting with a sukun; imperative conjugation restores it.
    if affix.startswith(araby.ALEF):
        affix = affix[1:]
    # NOTE: the original also precomputed a deduplicated 'tenses' list
    # here, but never used it — that dead code is removed.
    # 'in' replaces dict.has_key(), which no longer exists in Python 3.
    if affix in stem_verb_const.Table_affix:
        for tense, pronoun in stem_verb_const.Table_affix[affix]:
            if self.is_compatible_proaffix_tense(externPrefix, externSuffix,
                                                 tense, pronoun, transitive):
                conj_vocalized = vb.conjugateTenseForPronoun(tense, pronoun)
                # compare without diacritics (harakat and shadda stripped)
                conj_nm = araby.stripTashkeel(conj_vocalized)
                if conj_nm == unstemed_verb:
                    list_correct_conj.append({
                        'verb': infinitive_verb,
                        'tense': tense,
                        'pronoun': pronoun,
                        'vocalized': conj_vocalized,
                        'unvocalized': conj_nm,
                    })
    return list_correct_conj
示例7: getWordVariant
def getWordVariant(self, word, suffix):
    """
    Get the word variant to be joined to the suffix.
    For example: word = مدرسة, suffix = ي: the word is converted
    to مدرست.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    stripped_suffix = araby.stripTashkeel(suffix)
    # drop the final haraka from the word, if any
    stem = araby.stripLastHaraka(word)
    # suffixes before which a final Teh Marbuta is simply dropped
    dropping_suffixes = (
        araby.ALEF + araby.TEH,
        araby.YEH + araby.TEH_MARBUTA,
        araby.YEH,
        araby.YEH + araby.ALEF + araby.TEH,
    )
    if stem.endswith(araby.TEH_MARBUTA):
        if stripped_suffix in dropping_suffixes:
            stem = stem[:-1]
        elif stripped_suffix != u"":
            # otherwise the Teh Marbuta opens into a Teh
            stem = stem[:-1] + araby.TEH
    elif stem.endswith(araby.ALEF_MAKSURA) and stripped_suffix != u"":
        # Alef Maksura becomes Yeh before any suffix
        stem = stem[:-1] + araby.YEH
    elif stem.endswith(araby.HAMZA) and stripped_suffix != u"":
        # the Hamza seat follows the suffix's first haraka
        if suffix.startswith(araby.DAMMA):
            stem = stem[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            stem = stem[:-1] + araby.YEH_HAMZA
    return stem
示例8: check_word
def check_word(self, word, guessedTag=""):
    """
    Analyze one word morphologically (punctuation, stop word, verb,
    noun, unknown) and filter the solutions against the input word.
    @param word: the input word.
    @type word: unicode.
    @param guessedTag: previously guessed part-of-speech tag used to
    restrict the verb/noun analyses.
    @type guessedTag: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    word = araby.stripTatweel(word)
    vocalized_input = word
    bare_word = araby.stripTashkeel(word)
    analyses = []
    # try the word as punctuation
    analyses += self.check_word_as_pounct(bare_word)
    # Done: a stop word can also be an ordinary word (verb or noun);
    # this must be considered in future work
    analyses += self.check_word_as_stopword(bare_word)
    # Problem: some excluded words are treated as verbs or nouns
    if self.tagger.hasVerbTag(guessedTag) or self.tagger.isStopWordTag(guessedTag):
        analyses += self.check_word_as_verb(bare_word)
    if self.tagger.hasNounTag(guessedTag) or self.tagger.isStopWordTag(guessedTag):
        analyses += self.check_word_as_noun(bare_word)
    if not analyses:
        # nothing matched: analyze as an unknown word
        analyses += self.check_word_as_unknown(bare_word)
    # keep only solutions whose normalized form matches the input
    analyses = self.check_normalized(vocalized_input, analyses)
    # keep only solutions with compatible shadda positions
    analyses = self.check_shadda(vocalized_input, analyses)
    # keep only solutions compatible with partial vocalization, if enabled
    if self.partial_vocalization_support:
        analyses = self.check_partial_vocalized(vocalized_input, analyses)
    # add word frequency information in tags
    analyses = self.addWordFrequency(analyses)
    if not analyses:
        # fall back to a single 'unknown' word case
        analyses.append(wordCase.wordCase({
            'word': word,
            'affix': ('', '', '', ''),
            'stem': '',
            'original': word,
            'vocalized': word,
            'tags': u'',
            'type': 'unknown',
            'root': '',
            'template': '',
            'freq': self.wordfreq.getFreq(word, 'unknown'),
            'syntax': '',
        }))
    return analyses
示例9: setVocalized
def setVocalized(self, newvocalized):
    """
    Set the vocalized word and refresh the cached unvocalized form.
    @param newvocalized: the new given vocalized.
    @type newvocalized: unicode string
    """
    self.vocalized = newvocalized
    # keep the unvocalized cache in sync with the new value
    self.unvocalized = araby.stripTashkeel(newvocalized)
示例10: add
def add(self, word, suggestList):
    """
    Store suggestList as the suggestions for word, keyed by the word
    stripped of its diacritics. Empty words, empty suggestion lists
    and non-list values are silently ignored.
    """
    # guard clauses: same predicate as the original, negated
    if word == u"":
        return
    if suggestList == [] or type(suggestList).__name__ != 'list':
        return
    # ToDo: merge with previously stored suggestions for the word;
    # NB: doing so is costly when the word is frequent, so the entry
    # is simply overwritten for now.
    self.dict[araby.stripTashkeel(word)] = suggestList
示例11: vocalizeNamed
def vocalizeNamed(wordlist, synTags=""):
    """ Vocalize the words of a named entity.
    @param wordlist: words to vocalize
    @type wordlist: unicode list
    @param synTags: tags about the clause
    @type synTags: unicode
    @return: the vocalized wordlist.
    @rtype: unicode list
    """
    vocalized_list = []
    prefix = u""
    next = u""
    # tags may be passed down to the words of the named entity
    tags = synTags
    bin_count = 0
    for position, word in enumerate(wordlist):
        # keep the original word (with any harakat) at hand
        word_nm = araby.stripTashkeel(word)
        # the first word may fix the case (i'rab) of the whole name
        if position == 0 and word_nm:
            if word_nm in (u'أبي', u'بنو', u'آل', u'ابن',):
                tags += u"مجرور"
            elif word_nm in (u'أبو', ):
                tags += u"مرفوع"
            elif word_nm in (u'أبا', ):
                tags += u"منصوب"
        # select the vocalization
        if word_nm == u'بن':
            bin_count += 1
            if bin_count == 1:
                # the first 'بن' takes its mark from the clause tags
                if u'مجرور' in tags:
                    voc = u'بْنِ'
                elif u'مرفوع' in tags:
                    voc = u'بْنُ'
                elif u'منصوب' in tags:
                    voc = u'بْنَ'
                else:
                    voc = u'بْن'
            else:
                # subsequent ones are always مجرور
                voc = u'بْنِ'
        # Todo: vocalize the other name words
        else:
            voc = word
        vocalized_list.append(voc)
    return vocalized_list
示例12: getUnvOriginal
def getUnvOriginal(self,):
    """
    Get the unvocalized original form of the input word.
    Computed lazily from self.original and cached.
    @return: the given unvocalized original.
    @rtype: unicode string
    """
    # cache miss: derive from the original form, if available
    if not self.unvoriginal:
        if not self.original:
            return u""
        self.unvoriginal = araby.stripTashkeel(self.original)
    return self.unvoriginal
示例13: getUnvocalized
def getUnvocalized(self,):
    """
    Get the unvocalized form of the input word.
    Computed lazily from self.vocalized and cached.
    @return: the given unvocalized.
    @rtype: unicode string
    """
    # cache miss: derive from the vocalized form, if available
    if not self.unvocalized:
        if not self.vocalized:
            return u""
        self.unvocalized = araby.stripTashkeel(self.vocalized)
    return self.unvocalized
示例14: check_normalized
def check_normalized(self, word_vocalised, resulted_data):
    """
    Filter analysis results so that only entries whose unvocalized form
    matches the unvocalized input word are kept.

    Some normalized spellings make the dictionary return several
    vocalized candidates (e.g. ذئب normalized to ذءب may yield both
    ذئب and ذؤب); this keeps only the candidates consistent with the
    original input word.

    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the results found in the dictionary.
    @type resulted_data: list of dict.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    filtred_data = []
    inputword = araby.stripTashkeel(word_vocalised)
    for item in resulted_data:
        # BUGFIX: getattr without a default raised AttributeError for
        # items lacking 'vocalized'; default to None so such items are
        # skipped, matching the presence check used elsewhere.
        vocalized = getattr(item, 'vocalized', None)
        if vocalized:
            outputword = araby.stripTashkeel(vocalized)
            if inputword == outputword:
                filtred_data.append(item)
    return filtred_data
示例15: create_index_broken_plural
def create_index_broken_plural(self):
    """Deprecated: create index from the broken_plural dictionary
    to accelerate the search in the dictionary for broken_plural.

    Index keys are the hamza-normalized, unvocalized noun forms;
    values are lists of the vocalized forms sharing that key.
    """
    for vocnoun in BrokenPluralTable.keys():
        unvnoun = araby.stripTashkeel(vocnoun)
        normnoun = normalize_hamza(unvnoun)
        # dict.has_key() was removed in Python 3; setdefault performs
        # the same append-or-create in a single lookup
        self.BROKENPLURAL_DICTIONARY_INDEX.setdefault(normnoun, []).append(vocnoun)