当前位置: 首页>>代码示例>>Python>>正文


Python araby.stripTashkeel函数代码示例

本文整理汇总了Python中pyarabic.araby.stripTashkeel函数的典型用法代码示例。如果您正苦于以下问题:Python stripTashkeel函数的具体用法?Python stripTashkeel怎么用?Python stripTashkeel使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了stripTashkeel函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: check_normalized

    def check_normalized(self, word_vocalised, resulted_data):
        """
        Keep only the analyses whose unvocalized form equals the input word.

        The dictionary is searched with a normalized spelling, so a single
        input can come back with several entries that differ only in the
        hamza shape (for example ذئب is normalized to ذءب, which may also
        bring back ذؤب). Entries whose letters differ from the given word
        are dropped here.

        @param word_vocalised: the input word.
        @type word_vocalised: unicode.
        @param resulted_data: the candidate analyses; each item is read
            through its instance ``__dict__`` and is skipped when it has no
            ``vocalized`` entry there.
        @type resulted_data: list of objects.
        @return: the items of resulted_data matching the input word.
        @rtype: list.
        """
        target = araby.stripTashkeel(word_vocalised)
        return [
            candidate
            for candidate in resulted_data
            if "vocalized" in candidate.__dict__
            and target == araby.stripTashkeel(candidate.__dict__["vocalized"])
        ]
开发者ID:ATouhou,项目名称:mishkal,代码行数:26,代码来源:analex.py

示例2: getStemVariants

	def getStemVariants(self, stem, prefix, suffix):
		"""
		Build the candidate noun stems implied by the attached affixes.

		Example: مدرستي => مدرست + ي => مدرسة + ي.
		The unchanged stem is always part of the result.

		@param stem: the input stem.
		@type stem: unicode.
		@param prefix: the prefix.
		@type prefix: unicode.
		@param suffix: the suffix.
		@type suffix: unicode.
		@return: the possible stem variants.
		@rtype: set of unicode.
		"""
		# The first candidate is always the stem without changes.
		variants = set([stem])
		# The prefix is normalized as before, although no rule below reads it.
		prefix = araby.stripTashkeel(prefix)
		suffix = araby.stripTashkeel(suffix)
		# Suffixes in front of which a final Teh Marbuta may have been dropped.
		teh_marbuta_suffixes = (
			araby.ALEF + araby.TEH,
			araby.YEH + araby.TEH_MARBUTA,
			araby.YEH,
			araby.YEH + araby.ALEF + araby.TEH,
		)
		if suffix in teh_marbuta_suffixes:
			variants.add(stem + araby.TEH_MARBUTA)
		# No suffix, or a Yeh+Noon / Waw+Noon suffix: try the stem with a final Yeh.
		if suffix in ("", araby.YEH + araby.NOON, araby.WAW + araby.NOON):
			variants.add(stem + araby.YEH)
		# A final Yeh is also tried as Alef Maksura.
		if stem.endswith(araby.YEH):
			variants.add(stem[:-1] + araby.ALEF_MAKSURA)
		# The candidates are returned as they are; no validation step exists yet.
		return variants
开发者ID:ATouhou,项目名称:mishkal,代码行数:34,代码来源:stem_unknown.py

示例3: isPossibleCollocation

    def isPossibleCollocation(self, list2, context="", lenght=2):
        """
        Score whether the first two words of list2 may form a collocation.

        This is used to collect unknown collocations from user input. Only
        the first two words are inspected, after their tashkeel is stripped.

        @param list2: list of words, 2 or more words.
        @type list2: list of unicode.
        @param context: the context word; it only matters for rule 50,
            where it must be found in collocation_const.tab_noun_context.
        @type context: unicode.
        @param lenght: minimum number of words required in list2. Values
            below 2 are treated as 2, because two words are always read.
        @type lenght: integer.
        @return: 0 when list2 is too short, -1 when one of the two words
            does not match collocation_const.token_pat, otherwise the number
            of the matched rule (10, 15, 20, 30, 40, 50, 60), 100 by default.
        @rtype: integer.
        """
        # Two words are always read below, so never accept a shorter list
        # even when the caller passes lenght < 2 (this used to raise
        # IndexError on list2[1]).
        if len(list2) < max(lenght, 2):
            return 0
        first = araby.stripTashkeel(list2[0])
        second = araby.stripTashkeel(list2[1])
        token_pat = collocation_const.token_pat
        if not token_pat.search(first) or not token_pat.search(second):
            return -1
        if first in collocation_const.ADDITIONAL_WORDS:
            return 10
        if first in collocation_const.NAMED_PRIOR:
            return 15
        if second in collocation_const.SPECIAL_DEFINED:
            return 100

        second_is_defined = second.startswith(u"ال")
        if second_is_defined and first.startswith(u"ال"):
            return 20
        if first.endswith(u"ة") and second_is_defined:
            return 30
        # Words starting with the preposition Lam plus the definite article
        # get no rule of their own: they are always in the genitive case.
        if first.endswith(u"ة") and second.endswith(u"ة"):
            return 40
        if context != u"" and context in collocation_const.tab_noun_context and second_is_defined:
            return 50
        if first.endswith(u"ات") and second_is_defined:
            return 60
        return 100
开发者ID:ATouhou,项目名称:mishkal,代码行数:52,代码来源:collocations.py

示例4: getSuffixVariant

	def getSuffixVariant(self, word, suffix, enclitic):
		"""
		Get the form of the suffix to be joined to the word.

		Example: word = مدرس, suffix = ة, enclitic = ي; the Teh Marbuta of
		the suffix is converted to Teh.

		@param word: word found in dictionary.
		@type word: unicode.
		@param suffix: second level suffix.
		@type suffix: unicode.
		@param enclitic: first level suffix.
		@type enclitic: unicode.
		@return: the suffix variant, and the same variant without its last
			haraka when the suffix is tagged as taking one.
		@rtype: (unicode, unicode)
		"""
		bare_enclitic = araby.stripTashkeel(enclitic)
		has_enclitic = len(bare_enclitic) > 0
		weak_endings = (araby.ALEF_MAKSURA, araby.YEH, araby.ALEF)

		if araby.TEH_MARBUTA in suffix and has_enclitic:
			# Teh Marbuta cannot stay in the middle of the word.
			variant = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
		elif not has_enclitic and word[-1:] in weak_endings and araby.isHaraka(suffix):
			# A bare haraka is dropped after a final Alef Maksura, Yeh or Alef.
			variant = u""
		else:
			# Default: the suffix is used as given.
			variant = suffix

		# The tags are looked up with the given suffix, because the variant
		# has been changed and may not be found in the table.
		suffix_tags = stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']
		if u'متحرك' in suffix_tags:
			variant_without_irab = araby.stripLastHaraka(variant)
		else:
			variant_without_irab = variant
		return variant, variant_without_irab
开发者ID:CompMusic,项目名称:ArabicTransliterator,代码行数:27,代码来源:stem_noun.py

示例5: Comparetashkeel

def Comparetashkeel(text):
	"""
	Re-vocalize a correctly vocalized text and score the outcome against it.

	The tashkeel is stripped from the text, the text is vocalized again,
	and both versions are tokenized and compared word by word with
	araby.vocalizedlike. Both token lists are printed, joined by ':'.

	@param text: the correctly vocalized text.
	@type text: unicode.
	@return: [vocalized text, "correct:NN.NN%", "incorrect:NN.NN%", number
		of words in the given text]. Both rates are 0.00% when the two token
		lists differ in length or when the given text has no token.
	@rtype: list.
	"""
	import tashkeel.tashkeel as ArabicVocalizer
	# The given text is taken as the reference vocalization.
	reference_text = text
	vocalizer = ArabicVocalizer.TashkeelClass()
	vocalized_text = vocalizer.tashkeel(araby.stripTashkeel(text))

	# Remove the collocation symbols before comparing.
	candidate_text = vocalized_text.replace("'", "").replace("~", "")

	list1 = vocalizer.analyzer.tokenize(reference_text)
	list2 = vocalizer.analyzer.tokenize(candidate_text)
	# print(...) with one argument behaves like the print statement on
	# Python 2 and is also valid syntax on Python 3.
	print(u":".join(list1).encode('utf8'))
	print(u":".join(list2).encode('utf8'))

	correct = 0
	incorrect = 0
	total = len(list1)
	if len(list1) != len(list2):
		print("lists haven't the same length")
	else:
		for reference_word, candidate_word in zip(list1, list2):
			if araby.vocalizedlike(reference_word, candidate_word):
				correct += 1
			else:
				incorrect += 1

	def percent(count):
		# A text without tokens used to raise ZeroDivisionError here.
		if not total:
			return 0.0
		return round(count * 100.00 / total, 2)

	result = [
		vocalized_text,
		"correct:%0.2f%%" % percent(correct),
		"incorrect:%0.2f%%" % percent(incorrect),
		total,
	]
	return result
开发者ID:ATouhou,项目名称:mishkal,代码行数:34,代码来源:adaat.py

示例6: generate_possible_conjug

	def generate_possible_conjug(self, infinitive_verb, unstemed_verb , affix, future_type=araby.FATHA, externPrefix="-", externSuffix="-", transitive=True):
		"""
		Find the conjugations of infinitive_verb that are spelled like unstemed_verb.

		For each (tense, pronoun) pair listed for the affix in
		stem_verb_const.Table_affix and accepted by
		is_compatible_proaffix_tense, the verb is conjugated; the conjugation
		is kept when it equals unstemed_verb once its tashkeel is stripped.

		@param infinitive_verb: the verb to conjugate.
		@type infinitive_verb: unicode.
		@param unstemed_verb: the unvocalized form to be matched.
		@type unstemed_verb: unicode.
		@param affix: the affix used as key in Table_affix; a leading Alef
			is ignored.
		@type affix: unicode.
		@param future_type: the future mark, converted with
			libqutrub.ar_verb.get_future_type_entree.
		@param externPrefix: passed to is_compatible_proaffix_tense.
		@param externSuffix: passed to is_compatible_proaffix_tense.
		@param transitive: passed to the verb class and to the
			compatibility test.
		@return: dictionaries with the keys 'verb', 'tense', 'pronoun',
			'vocalized' and 'unvocalized'; an empty set when one of the three
			first arguments is empty.
		@rtype: list of dict.
		"""
		if infinitive_verb=="" or unstemed_verb=="" or affix=="":
			# NOTE(review): this exit returns a set while the normal path
			# returns a list; kept as is for existing callers.
			return set()
		list_correct_conj = []
		future_type = libqutrub.ar_verb.get_future_type_entree(future_type)
		vb = libqutrub.classverb.verbclass(infinitive_verb, transitive, future_type)
		# The Alef is not a part of the prefix: it is only used to avoid
		# starting with a sukun, and the imperative conjugation generates it.
		if affix.startswith(araby.ALEF):
			affix = affix[1:]
		# 'in' replaces dict.has_key, which does not exist on Python 3.
		if affix in stem_verb_const.Table_affix:
			for pair in stem_verb_const.Table_affix[affix]:
				tense = pair[0]
				pronoun = pair[1]
				if self.is_compatible_proaffix_tense(externPrefix, externSuffix, tense, pronoun, transitive):
					conj_vocalized = vb.conjugateTenseForPronoun(tense, pronoun)
					# strip all marks and shadda
					conj_nm = araby.stripTashkeel(conj_vocalized)
					if conj_nm == unstemed_verb:
						list_correct_conj.append({'verb':infinitive_verb, 'tense':tense, 'pronoun':pronoun, 'vocalized':conj_vocalized, 'unvocalized':conj_nm})
		return list_correct_conj
开发者ID:ATouhou,项目名称:mishkal,代码行数:35,代码来源:stem_verb.py

示例7: getWordVariant

	def getWordVariant(self, word, suffix):
		"""
		Get the form of the word to be joined to the suffix.

		Example: word = مدرسة, suffix = ي; the word is converted to مدرست.

		@param word: word found in dictionary.
		@type word: unicode.
		@param suffix: suffix (first or second level).
		@type suffix: unicode.
		@return: variant of word.
		@rtype: unicode.
		"""
		bare_suffix = araby.stripTashkeel(suffix)
		# Work on the word without its last haraka.
		stem = araby.stripLastHaraka(word)
		# Suffixes in front of which a final Teh Marbuta is removed.
		dropping_suffixes = (
			araby.ALEF + araby.TEH,
			araby.YEH + araby.TEH_MARBUTA,
			araby.YEH,
			araby.YEH + araby.ALEF + araby.TEH,
		)

		if stem.endswith(araby.TEH_MARBUTA) and bare_suffix in dropping_suffixes:
			return stem[:-1]
		# Every remaining rule needs a non-empty suffix.
		if bare_suffix == u"":
			return stem
		if stem.endswith(araby.TEH_MARBUTA):
			return stem[:-1] + araby.TEH
		if stem.endswith(araby.ALEF_MAKSURA):
			return stem[:-1] + araby.YEH
		if stem.endswith(araby.HAMZA):
			# The seat of the hamza follows the first haraka of the suffix.
			if suffix.startswith(araby.DAMMA):
				return stem[:-1] + araby.WAW_HAMZA
			if suffix.startswith(araby.KASRA):
				return stem[:-1] + araby.YEH_HAMZA
		return stem
开发者ID:CompMusic,项目名称:ArabicTransliterator,代码行数:31,代码来源:stem_noun.py

示例8: check_word

	def check_word(self, word, guessedTag=""):
		"""
		Analyze one word morphologically.

		The word is checked as punctuation and as stop word, then as verb
		and/or noun depending on guessedTag, and as unknown word when nothing
		was found. The analyses are then filtered against the given word
		and completed with word frequency information.

		@param word: the input word.
		@type word: unicode.
		@param guessedTag: the tag handed to the tagger to decide whether
			the verb and noun analyses are run.
		@return: list of analyzed words with tags; a single 'unknown' case
			when no analysis is left.
		@rtype: list.
		"""
		word = araby.stripTatweel(word)
		given_form = word
		bare_word = araby.stripTashkeel(word)
		analyses = []
		# the word may be a punctuation
		analyses += self.check_word_as_pounct(bare_word)
		# A stop word can also be a normal word (verb or noun), so the stop
		# word analysis does not end the lookup.
		analyses += self.check_word_as_stopword(bare_word)

		# Problem: some stop words are also considered as verbs or nouns,
		# hence the stop word tag opens both analyses below.
		if self.tagger.hasVerbTag(guessedTag) or self.tagger.isStopWordTag(guessedTag):
			analyses += self.check_word_as_verb(bare_word)
		if self.tagger.hasNounTag(guessedTag) or self.tagger.isStopWordTag(guessedTag):
			analyses += self.check_word_as_noun(bare_word)
		if len(analyses) == 0:
			# nothing found: check the word as unknown
			analyses += self.check_word_as_unknown(bare_word)

		# keep the analyses equivalent to the given normalized word
		analyses = self.check_normalized(given_form, analyses)
		# keep the analyses which agree with the given word about shadda
		analyses = self.check_shadda(given_form, analyses)
		# keep the analyses vocalized like the given word
		if self.partial_vocalization_support:
			analyses = self.check_partial_vocalized(given_form, analyses)
		# add word frequency information in tags
		analyses = self.addWordFrequency(analyses)

		if len(analyses) == 0:
			# Everything was filtered out: give back the word itself as an
			# unknown case.
			unknown_case = {
				'word': word,
				'affix': ('', '', '', ''),
				'stem': '',
				'original': word,
				'vocalized': word,
				'tags': u'',
				'type': 'unknown',
				'root': '',
				'template': '',
				'freq': self.wordfreq.getFreq(word, 'unknown'),
				'syntax': '',
			}
			analyses.append(wordCase.wordCase(unknown_case))
		return analyses
开发者ID:CompMusic,项目名称:ArabicTransliterator,代码行数:59,代码来源:analex.py

示例9: setVocalized

	def setVocalized(self, newvocalized):
		"""
		Store a new vocalized word and refresh the unvocalized form with it.

		@param newvocalized: the new given vocalized.
		@type newvocalized: unicode string
		"""
		self.vocalized = newvocalized
		# The unvocalized form is the same word with its tashkeel stripped.
		bare_form = araby.stripTashkeel(newvocalized)
		self.unvocalized = bare_form
开发者ID:CompMusic,项目名称:ArabicTransliterator,代码行数:8,代码来源:stemmedword.py

示例10: add

	def add(self, word, suggestList):
		"""
		Record the suggestions of a word, keyed by the word without tashkeel.

		Nothing is stored when the word is empty, or when suggestList is
		empty or is not a list. Suggestions stored earlier for the same key
		are replaced.

		@param word: the word the suggestions belong to.
		@type word: unicode.
		@param suggestList: the suggestions.
		@type suggestList: list.
		"""
		# isinstance replaces the comparison of the type name with 'list',
		# so list subclasses are accepted too.
		if word == u"" or not isinstance(suggestList, list) or not suggestList:
			return
		# TODO: merge with the suggestions already stored for the word and
		# remove the duplicates; not done because it is time consuming for
		# frequent words.
		self.dict[araby.stripTashkeel(word)] = suggestList
开发者ID:Fahad-Alsaidi,项目名称:mishkal,代码行数:12,代码来源:spelling.py

示例11: vocalizeNamed

def vocalizeNamed(wordlist, synTags=""):
	""" Vocalize the words of a name; only the word "bin" is vocalized for now.
	@param wordlist: words to vocalize
	@type wordlist: unicode list
	@param synTags: tags about the clause
	@type synTags: unicode
	@return: the words, where each "bin" is replaced by its vocalized form.
	@rtype: unicode list
	"""
	# Vocalized forms of the first "bin", tried in this order against the tags.
	first_bin_forms = (
		(u'مجرور', u'بْنِ'),
		(u'مرفوع', u'بْنُ'),
		(u'منصوب', u'بْنَ'),
	)
	# tags can be passed by the caller and are completed by the first word
	tags = synTags
	vocalized_words = []
	bin_seen = 0
	for position, word in enumerate(wordlist):
		# the given word keeps its harakat; the tests use the bare form
		bare = araby.stripTashkeel(word)
		if position == 0 and bare:
			# the first word can fix the case of what follows
			if bare in (u'أبي', u'بنو', u'آل', u'ابن'):
				tags += u"مجرور"
			elif bare == u'أبو':
				tags += u"مرفوع"
			elif bare == u'أبا':
				tags += u"منصوب"

		if bare != u'بن':
			# Todo: vocalize names; other words are kept as given
			vocalized_words.append(word)
			continue

		bin_seen += 1
		if bin_seen > 1:
			# every "bin" after the first one takes the kasra form
			vocalized_words.append(u'بْنِ')
			continue
		# the first "bin" follows the tags; no final haraka without a tag
		chosen = u'بْن'
		for tag, form in first_bin_forms:
			if tag in tags:
				chosen = form
				break
		vocalized_words.append(chosen)
	return vocalized_words
开发者ID:CompMusic,项目名称:ArabicTransliterator,代码行数:51,代码来源:named.py

示例12: getUnvOriginal

	def getUnvOriginal(self,):
		"""
		Get the unvocalized original form of the input word.

		The form is computed from self.original on first use and kept in
		self.unvoriginal.

		@return: the unvocalized original, or an empty string when no
			original form is set.
		@rtype: unicode string
		"""
		if not self.unvoriginal:
			if not self.original:
				return u""
			# computed once, then served from the attribute
			self.unvoriginal = araby.stripTashkeel(self.original)
		return self.unvoriginal
开发者ID:CompMusic,项目名称:ArabicTransliterator,代码行数:14,代码来源:stemmedsynword.py

示例13: getUnvocalized

	def getUnvocalized(self,):
		"""
		Get the unvocalized form of the input word.

		The form is computed from self.vocalized on first use and kept in
		self.unvocalized.

		@return: the unvocalized form, or an empty string when no vocalized
			form is set.
		@rtype: unicode string
		"""
		if not self.unvocalized:
			if not self.vocalized:
				return u""
			# computed once, then served from the attribute
			self.unvocalized = araby.stripTashkeel(self.vocalized)
		return self.unvocalized
开发者ID:CompMusic,项目名称:ArabicTransliterator,代码行数:14,代码来源:stemmedsynword.py

示例14: check_normalized

	def check_normalized(self, word_vocalised, resulted_data):
		"""
		Keep only the analyses whose unvocalized form equals the input word.

		The dictionary is searched with a normalized spelling, so a single
		input can come back with several entries that differ only in the
		hamza shape (for example ذئب is normalized to ذءب, which may also
		bring back ذؤب). Entries whose letters differ from the given word
		are dropped here.

		@param word_vocalised: the input word.
		@type word_vocalised: unicode.
		@param resulted_data: the candidate analyses; an item without a
			'vocalized' attribute, or with an empty one, is dropped.
		@type resulted_data: list of objects.
		@return: the items of resulted_data matching the input word.
		@rtype: list.
		"""
		inputword = araby.stripTashkeel(word_vocalised)
		filtred_data = []
		for item in resulted_data:
			# The None default makes an item without the attribute be
			# skipped instead of raising AttributeError.
			vocalized = getattr(item, 'vocalized', None)
			if vocalized and inputword == araby.stripTashkeel(vocalized):
				filtred_data.append(item)
		return filtred_data
开发者ID:CompMusic,项目名称:ArabicTransliterator,代码行数:23,代码来源:analex.py

示例15: create_index_broken_plural

    def create_index_broken_plural(self):
        """Deprecated: create an index from the broken plural dictionary.

        Each key of BrokenPluralTable is added to
        self.BROKENPLURAL_DICTIONARY_INDEX under its form without tashkeel
        and with normalized hamza, to speed up the search for broken
        plurals. Keys sharing the same normalized form end up in one list.
        """
        index = self.BROKENPLURAL_DICTIONARY_INDEX
        for vocnoun in BrokenPluralTable:
            normnoun = normalize_hamza(araby.stripTashkeel(vocnoun))
            # setdefault replaces the has_key test, which does not exist on
            # Python 3, and creates the list on first use.
            index.setdefault(normnoun, []).append(vocnoun)
开发者ID:ATouhou,项目名称:mishkal,代码行数:15,代码来源:stem_unknown.old.py


注:本文中的pyarabic.araby.stripTashkeel函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。