This article collects typical usage examples of the Python function pyarabic.araby.strip_tashkeel. If you are wondering what strip_tashkeel does, how to call it, or what real-world usage looks like, the selected code samples below should help.
The following presents 15 code examples of the strip_tashkeel function, ordered by popularity by default.
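Before looking at the examples, here is a minimal standalone sketch (not taken from any of the projects below) of what strip_tashkeel does: it removes the Arabic diacritics (tashkeel) from a string and leaves the letters untouched. The sample word is only illustrative.

from pyarabic import araby

word = u"العَرَبِيَّةُ"
# strip_tashkeel returns the same string without diacritical marks
print(araby.strip_tashkeel(word))  # -> العربية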
Example 1: __init__
def __init__(self, result_dict = None, order = -1):
# ToDo
    # copy the super class attributes to the current class
#stemmedword.stemmedWord.__init__(self, result_dict.get_dict())
if result_dict:
self.__dict__ = result_dict.__dict__.copy()
self.unvocalized = araby.strip_tashkeel(self.vocalized)
self.unvoriginal = araby.strip_tashkeel(self.original)
self.tag_verbal_factor = 0
self.tag_nominal_factor = 0
self.tag_kana_rafe3 = False
if self.is_verb():
self.tag_kana_rafe3 = self._is_kana_rafe3()
if self.is_stopword():
self.tag_kana_rafe3 = self._is_kana_rafe3()
self.tag_nominal_factor = self.__get_nominal_factor()
#verbal factor
self.tag_verbal_factor = self.__get_verbal_factor()
self.tag_addition = self._is_addition()
self.tag_break = self._is_break()
self.forced_word_case = False
self.syntax = u"" # used for syntaxique analysis porpos
self.semantic = u"" # used for semantic analysis porposes
self.forced_wordtype = False
self.order = order
self.next = {}
self.previous = {}
self.sem_next = {}
self.sem_previous = {}
self.score = 0
self.rule = 0 # rule used to select the current case in vocalization
Example 2: check_normalized
def check_normalized(word_vocalised, resulted_data):
"""
    If the entered word matches a word found in the dictionary,
    the analyzer returns words vocalized like it, in order to treat
    some normalized cases.
    If the word is ذئب, its normalized form is ذءب,
    which can yield ذئب and ذؤب from the dictionary.
    This function filters the normalized results according to
    the given word and returns ذئب.
@param word_vocalised: the input word.
@type word_vocalised: unicode.
    @param resulted_data: the result data found in the dictionary.
@type resulted_data: list of dict.
@return: list of dictionaries of analyzed words with tags.
@rtype: list.
"""
#print word_vocalised.encode('utf8')
filtred_data = []
inputword = araby.strip_tashkeel(word_vocalised)
for item in resulted_data:
if 'vocalized' in item.__dict__ : #.has_key('vocalized') :
#~ if 'vocalized' in item :
#~ outputword = araby.strip_tashkeel(item['vocalized'])
outputword = araby.strip_tashkeel(item.__dict__['vocalized'])
#print u'\t'.join([inputword, outputword]).encode('utf8')
if inputword == outputword:
#item['tags'] += ':a'
filtred_data.append(item)
return filtred_data
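The filtering step in this example boils down to comparing unvocalized forms. A minimal standalone sketch of the same idea, working on plain strings instead of analysis objects (the helper name filter_by_unvocalized is illustrative and not part of the original module):

from pyarabic import araby

def filter_by_unvocalized(word_vocalised, candidates):
    # keep only the candidate strings whose unvocalized form matches the input word
    inputword = araby.strip_tashkeel(word_vocalised)
    return [c for c in candidates if araby.strip_tashkeel(c) == inputword]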
Example 3: get_word_variant
def get_word_variant(word, suffix, encletic):
"""
Get the word variant to be joined to the suffix.
For example: word = مدرسة, suffix = ي. The word is converted to مدرست.
@param word: word found in dictionary.
@type word: unicode.
    @param suffix: suffix (first level).
    @type suffix: unicode.
    @param encletic: enclitic (second level).
@type encletic: unicode.
@return: variant of word.
@rtype: unicode.
"""
word_stem = word
suffix_nm = araby.strip_tashkeel(suffix)
encletic_nm = araby.strip_tashkeel(encletic)
long_suffix_nm = suffix_nm + encletic_nm
    # if the word ends with a haraka, strip it
word_stem = araby.strip_lastharaka(word_stem)
    # feminine noun ending in teh marbuta: drop it before suffixes like ات and ية
if word_stem.endswith(araby.TEH_MARBUTA):
if suffix_nm in (araby.ALEF+araby.TEH, araby.YEH+araby.TEH_MARBUTA,
araby.YEH, araby.YEH+araby.ALEF+araby.TEH):
word_stem = word_stem[:-1]
        # feminine noun ending in teh marbuta: open it (convert it to teh) before attaching
#مدرسة +ين = مدرستين
elif long_suffix_nm != u"":
word_stem = word_stem[:-1]+araby.TEH
elif word_stem.endswith(araby.ALEF_MAKSURA):
        # noun ending in alef maksura: when joined to a grammatical suffix, the alef maksura becomes yeh
        # مستوى +ان = مستويان
        # if the morphological suffix contains letters, the alef maksura becomes yeh
if suffix_nm != u"":
word_stem = word_stem[:-1]+araby.YEH
        # if the grammatical suffix is only harakat and an enclitic pronoun is attached, the alef maksura becomes alef
elif encletic_nm != u"":
word_stem = word_stem[:-1]+araby.ALEF
elif word_stem.endswith(araby.KASRA + araby.YEH):
        # defective noun: ends with yeh preceded by a kasra
        # if there is no enclitic pronoun and the suffix is only harakat,
        # drop the final kasra and yeh
if not encletic_nm and not suffix_nm :
word_stem = word_stem[:-2]
    # adjust a word ending in hamza according to the haraka of the grammatical suffix
elif word_stem.endswith(araby.HAMZA) and suffix_nm != u"":
if suffix.startswith(araby.DAMMA):
word_stem = word_stem[:-1] + araby.WAW_HAMZA
elif suffix.startswith(araby.KASRA):
word_stem = word_stem[:-1] + araby.YEH_HAMZA
    elif (word_stem.endswith(araby.YEH + araby.HAMZA) or word_stem.endswith(araby.YEH + araby.SUKUN + araby.HAMZA)) and suffix.startswith(araby.FATHATAN):
word_stem = word_stem[:-1] + araby.YEH_HAMZA
return word_stem
Example 4: detect_number_phrases_position
def detect_number_phrases_position(wordlist):
"""
Detect number words in a text and return positions of each phrase.
Example:
>>> txt = u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا"
>>> wordlist = araby.tokenize(txt)
>>> positions_phrases = detect_number_phrases_position(wordlist)
    >>> print positions_phrases
[(1, 3), (6, 7)]
@param wordlist: wordlist
@type wordlist: unicode list
@return: list of numbers clause positions [(start,end),(start2,end2),]
@rtype: list of tuple
"""
#~ wordlist# = text.split(u' ')
#print words
phrases = []
startnumber = -1
endnumber = False
#~ taglist = []
for i, word in enumerate(wordlist):
#~ word = wordlist[i]
if i+1 < len(wordlist):
nextword = araby.strip_tashkeel(wordlist[i+1])
else: nextword = None
        # save the original word with its harakat, if any
word_nm = araby.strip_tashkeel(word)
key = word_nm
# the first word can have prefixes
        if word_nm and startnumber < 0 and word_nm != u'واحد' \
and word_nm[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
key = word_nm[1:]
elif word_nm != u'واحد' and word_nm.startswith(u'و'):
key = word_nm[1:]
if key in nbconst.NUMBER_WORDS or key.isnumeric():
if key not in (u'أحد', u'إحدى', u'اثنا', u'اثني', u'اثنتي', \
u'اثنتا') or nextword in (u'عشر', u'عشرة'):
if startnumber < 0:
startnumber = i
endnumber = i
# phrase.append(word)
else:
            if startnumber >= 0:  # there is a previous number phrase
phrases.append((startnumber, endnumber))
startnumber = -1
# add the final phrases
    if startnumber >= 0:  # there is a previous number phrase
phrases.append((startnumber, endnumber))
return phrases
Example 5: is_possible_collocation
def is_possible_collocation(self, list2, context = "", lenght = 2):
"""
    Guess if the given list is a possible collocation.
    This is used to collect unknown collocations from user input.
    @param list2: list of words, 2 or more.
    @type list2: list of unicode.
    @param lenght: minimum number of words in the collocation.
    @type lenght: integer.
    @return: the rule of the found collocation (100 by default,
        0 if the list is too short, -1 if an item is not a token).
    @rtype: integer.
"""
if len(list2)<lenght:
return 0
else:
item_v1 = list2[0]
item_v2 = list2[1]
item1 = araby.strip_tashkeel(item_v1)
item2 = araby.strip_tashkeel(item_v2)
#if item1[-1:] in (u".", u"?", u", ", u'[', u']', u'(', ')'):
# return 0
if not cconst.token_pat.search(item1) or not \
cconst.token_pat.search(item2) :
return -1
#else: return 100
elif item1 in cconst.ADDITIONAL_WORDS :
return 10
elif item1 in cconst.NAMED_PRIOR :
return 15
elif (item2 not in cconst.SPECIAL_DEFINED):
if item2.startswith(u'ال') and item1.startswith(u'ال'):
return 20
elif item1.endswith(u'ة') and item2.startswith(u'ال'):
return 30
            # case of words starting with the lam of preposition plus the definite article
            # no rule needed, because they are always majrour (genitive)
#if item2.startswith(u'لل'):
# return 40
elif item1.endswith(u'ة') and item2.endswith(u'ة') :
return 40
#if item1.endswith(u'ي') and item2.endswith(u'ي'):
# return 60
elif context != u"" and context in cconst.tab_noun_context \
and item2.startswith(u'ال') :
return 50
#return True
elif item1.endswith(u'ات') and item2.startswith(u'ال') :
return 60
return 100
Example 6: detect_numbers
def detect_numbers(wordlist):
"""
Detect number words in a text and return a taglist as BIO.
Example:
>>> wordlist = araby.tokenize(u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا")
>>> detect_numbers(wordlist)
    ['O', 'DB', 'DI', 'DI', 'O', 'O', 'DB', 'DI', 'O']
@param wordlist: wordlist
@type wordlist: unicode list
@return: list of tags BIO
@rtype: list of unicode
"""
#~ phrases = []
starts = False
taglist = []
for i, word in enumerate(wordlist):
#~ word = wordlist[i]
if i+1 < len(wordlist):
nextword = araby.strip_tashkeel(wordlist[i+1])
else:
nextword = None
        # save the original word with its harakat, if any
word_nm = araby.strip_tashkeel(word)
key = word_nm
# the first word can have prefixes
if word_nm and not starts and word_nm != u'واحد' \
and word_nm[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
key = word_nm[1:]
elif word_nm != u'واحد' and word_nm.startswith(u'و'):
key = word_nm[1:]
if key in nbconst.NUMBER_WORDS or key.isnumeric():
if key not in (u'أحد', u'إحدى', u'اثنا', u'اثني', u'اثنتي', \
u'اثنتا') or nextword in (u'عشر', u'عشرة'):
if not starts:
taglist.append("DB")
starts = True
else:
taglist.append("DI")
else:
starts = False
taglist.append("O")
else:
starts = False
taglist.append("O")
return taglist
Example 7: detect_number_words
def detect_number_words(text):
"""
Detect number words in a text.
@param text: input text
@type text: unicode
    @return: number words extracted from text
    @rtype: integer
    >>> detect_number_words(u"وجدت خمسمئة وثلاثة وعشرين دينارا")
خمسمئة وثلاثة وعشرين
"""
#~ words = araby.tokenize(text)
#print words
phrases_context = extract_number_context(text)
for ph_con in phrases_context:
if len(ph_con) >= 3:
previous = ph_con[0]
phrase = ph_con[1]
nextword = ph_con[2]
numberedwords = phrase
numeric = text2number(numberedwords)
tags = get_previous_tag(previous)
vocalized = vocalize_number(araby.strip_tashkeel(\
numberedwords).split(' '), tags)
            # calculate vocalization similarity
sim = araby.vocalized_similarity(numberedwords, vocalized)
voc_unit = vocalize_unit(numeric, nextword)
sim_unit = araby.vocalized_similarity(voc_unit, \
nextword)
if sim < 0:
print u'\t'.join([str(sim), numberedwords, vocalized, \
str(numeric), u' '.join([previous, phrase, nextword]), \
nextword, voc_unit, str(sim_unit)]).encode('utf8')
Example 8: wordtag
def wordtag(text):
"""
    Tag each word as noun, verb, or tool (stopword).
"""
import naftawayh.wordtag
tagger = naftawayh.wordtag.WordTagger()
word_list = token_text(text)
if len(word_list) == 0:
return []
else:
list_result = []
second_previous =""
previous = u""
#~previous_tag = ""
for word in word_list:
word_nm = araby.strip_tashkeel(word)
tag = ''
if tagger.is_stopword(word):
tag = 't'
else:
if tagger.is_noun(word):
tag += 'n'
if tagger.is_verb(word):
tag += 'v'
if tag in ("", "nv"):
tag = tagger.context_analyse(previous, word)+"1"
if tag in ("", "nv1", "vn1"):
tag = tagger.context_analyse(u" ".join([second_previous, previous]), word)+"2"
list_result.append({'word':word, 'tag': tag})
second_previous = previous
previous = word_nm
#~previous_tag = tag
return list_result
Example 9: vocalize_named
def vocalize_named(wordlist, syn_tags = ""):
    """ Vocalize named-entity words
@param wordlist: words to vocalize
@type wordlist: unicode list
@param syn_tags: tags about the clause
@type syn_tags: unicode
@return: the vocalized wordlist.
@rtype: unicode
"""
newlist = []
#~ prefix = u""
#~ nextword = u""
#detect tags
# we can pass tags to this number word
tags = syn_tags
bin_count = 0
for i in range(len(wordlist)):
        # save the original word with its harakat, if any
word = wordlist[i]
word_nm = araby.strip_tashkeel(word)
# the first word can have prefixes
if i == 0 and word_nm:
            # words that take the majrour (genitive) tag
if word_nm in (u'أبي', u'بنو', u'آل', u'ابن',):
tags += u"مجرور"
elif word_nm in (u'أبو', ):
tags += u"مرفوع"
elif word_nm in (u'أبا', ):
tags += u"منصوب"
Example 10: get_word_variant
def get_word_variant(word, suffix):
"""
Get the word variant to be joined to the suffix.
For example: word = مدرسة, suffix = ي. The word is converted to مدرست.
@param word: word found in dictionary.
@type word: unicode.
    @param suffix: suffix (first or second level).
@type suffix: unicode.
@return: variant of word.
@rtype: unicode.
"""
word_stem = word
# print word.encode('utf8')
#HARAKAT = (FATHA, DAMMA, KASRA, SUKUN, DAMMA, DAMMATAN,
# KASRATAN, FATHATAN)
suffix_nm = araby.strip_tashkeel(suffix)
    # if the word ends with a haraka, strip it
word_stem = araby.strip_lastharaka(word_stem)
if word_stem.endswith(araby.TEH_MARBUTA) and suffix_nm in (
araby.ALEF+araby.TEH, araby.YEH+araby.TEH_MARBUTA,
araby.YEH, araby.YEH+araby.ALEF+araby.TEH):
word_stem = word_stem[:-1]
elif word_stem.endswith(araby.TEH_MARBUTA) and suffix_nm != u"":
word_stem = word_stem[:-1]+araby.TEH
elif word_stem.endswith(araby.ALEF_MAKSURA) and suffix_nm != u"":
word_stem = word_stem[:-1]+araby.YEH
elif word_stem.endswith(araby.HAMZA) and suffix_nm != u"":
if suffix.startswith(araby.DAMMA):
word_stem = word_stem[:-1] + araby.WAW_HAMZA
elif suffix.startswith(araby.KASRA):
word_stem = word_stem[:-1] + araby.YEH_HAMZA
    elif (word_stem.endswith(araby.YEH + araby.HAMZA) or word_stem.endswith(araby.YEH + araby.SUKUN + araby.HAMZA)) and suffix.startswith(araby.FATHATAN):
word_stem = word_stem[:-1] + araby.YEH_HAMZA
return word_stem
Example 11: get_suffix_variants
def get_suffix_variants(word, suffix, enclitic):
"""
Get the suffix variant to be joined to the word.
    For example: word = مدرس, suffix = ة, enclitic = ي.
The suffix is converted to Teh.
@param word: word found in dictionary.
@type word: unicode.
@param suffix: second level suffix.
@type suffix: unicode.
@param enclitic: first level suffix.
@type enclitic: unicode.
@return: variant of suffixes (vocalized suffix and vocalized
suffix without I'rab short mark).
@rtype: (unicode, unicode)
"""
enclitic_nm = araby.strip_tashkeel(enclitic)
newsuffix = suffix #default value
#if the word ends by a haraka
    if suffix.find(araby.TEH_MARBUTA) >= 0 and len(enclitic_nm) > 0:
newsuffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
elif not enclitic_nm and word[-1:] in (araby.YEH, araby.ALEF) and araby.is_haraka(suffix):
newsuffix = u""
    # generate the suffix without the i'rab short mark
    # here we look up with the given suffix, because the new suffix has
    # changed and may not be found in the table
if u'متحرك' in snconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
suffix_non_irab_mark = araby.strip_lastharaka(newsuffix)
else:
suffix_non_irab_mark = newsuffix
return newsuffix, suffix_non_irab_mark
Example 12: get_word_variant
def get_word_variant(word, suffix):
"""
Get the word variant to be joined to the suffix.
For example: word = مدرسة, suffix = ي. The word is converted to مدرست.
@param word: word found in dictionary.
@type word: unicode.
    @param suffix: suffix (first or second level).
@type suffix: unicode.
@return: variant of word.
@rtype: unicode.
"""
word_stem = word
suffix_nm = araby.strip_tashkeel(suffix)
    # if the word ends with a haraka, strip it when the suffix is not null
if suffix:
word_stem = araby.strip_lastharaka(word_stem)
if word_stem.endswith(araby.ALEF_MAKSURA) and suffix_nm != u"":
word_stem = word_stem[:-1]+araby.YEH
elif word_stem.endswith(araby.HAMZA) and suffix_nm != u"":
if suffix.startswith(araby.DAMMA):
word_stem = word_stem[:-1] + araby.WAW_HAMZA
elif suffix.startswith(araby.KASRA):
word_stem = word_stem[:-1] + araby.YEH_HAMZA
return word_stem
Example 13: search_arabic
def search_arabic(self, q, fetch_subgraph = True, limit = DEFAULT_LIMIT,
fetchplan = DEFAULT_FETCHPLAN):
"""
Searches for given label intelligently handling vocalization.
(This does not make much sense without a fetchplan as you will get
index nodes only.)
"""
    # If the query is not vocalized, search the unvocalized index and,
    # if requested, return the subtree
if not araby.is_vocalized(q):
return self.search_index(q, fetch_subgraph,
"ArabicNode.unvocalized_label", limit,
fetchplan)
# If it is vocalized, search unvocalized index and check for
# "compatibility" of vocalization
matches = self.search_index(araby.strip_tashkeel(q), False,
"ArabicNode.unvocalized_label", limit)
rids = [n.rid for n in matches.primary_results
if Tools.is_vocalized_like(q, n.data["label"])]
# Ignore vocalization if there is no compatible one
if not rids:
rids = [n.rid for n in matches.primary_results]
return self.get_nodes(rids, fetch_subgraph, limit, fetchplan)
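The same strip-then-compare pattern can be sketched without the graph database. Below is a rough, standalone approximation assuming an in-memory dict that maps unvocalized labels to vocalized entries; the lookup helper and index layout are illustrative only, and vocalized_similarity is assumed to return a negative value on incompatible vocalization, as in example 7 above.

from pyarabic import araby

def lookup(query, index):
    # index: dict mapping an unvocalized label to a list of vocalized entries
    entries = index.get(araby.strip_tashkeel(query), [])
    if not araby.is_vocalized(query):
        return entries
    # keep entries whose vocalization is compatible with the query,
    # and fall back to all entries when none is compatible
    compatible = [e for e in entries
                  if araby.vocalized_similarity(query, e) >= 0]
    return compatible or entries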
Example 14: set_vocalized
def set_vocalized(self, newvocalized):
"""
Set the vocalized word
@param newvocalized: the new given vocalized.
@type newvocalized: unicode string
"""
self.vocalized = newvocalized
self.unvocalized = araby.strip_tashkeel(newvocalized)
Example 15: detect_number_phrases_position
def detect_number_phrases_position(wordlist):
"""
Detect number words in a text and return positions of each phrase.
@param wordlist: wordlist
@type wordlist: unicode list
    @return: list of numbers clause positions [(start, end), (start2, end2), ...]
    @rtype: list of tuple
    >>> wordlist = araby.tokenize(u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا")
    >>> detect_number_phrases_position(wordlist)
    [(1, 3), (6, 7)]
"""
#~ wordlist# = text.split(u' ')
#print words
phrases = []
startnumber = -1
endnumber = False
taglist = []
for i in range(len(wordlist)):
word = wordlist[i]
if i+1 < len(wordlist):
nextword = araby.strip_tashkeel(wordlist[i+1])
else: nextword = None
        # save the original word with its harakat, if any
word_nm = araby.strip_tashkeel(word)
key = word_nm
# the first word can have prefixes
        if word_nm and startnumber < 0 and word_nm != u'واحد' \
and word_nm[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
key = word_nm[1:]
elif word_nm != u'واحد' and word_nm.startswith(u'و'):
key = word_nm[1:]
        if key in nbconst.NumberWords:
            if key not in (u'أحد', u'إحدى', u'اثنا', u'اثني', u'اثنتي', \
u'اثنتا') or nextword in (u'عشر', u'عشرة'):
if startnumber < 0:
startnumber = i
endnumber = i
# phrase.append(word)
else:
            if startnumber >= 0:  # there is a previous number phrase
phrases.append((startnumber, endnumber))
startnumber = -1
# add the final phrases
    if startnumber >= 0:  # there is a previous number phrase
phrases.append((startnumber, endnumber))
return phrases