

Python araby.strip_tashkeel Function Code Examples

This article collects and summarizes typical usage examples of the pyarabic.araby.strip_tashkeel function in Python. If you are wondering how exactly strip_tashkeel works, how to call it, or what real-world uses look like, the curated code examples below should help.


The following presents 15 code examples of the strip_tashkeel function, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
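Before the examples, here is a minimal usage sketch (assuming pyarabic is installed) showing what strip_tashkeel does: it removes the diacritical marks (harakat and shadda) from Arabic text and returns the bare letters.

# -*- coding: utf-8 -*-
from pyarabic import araby

word = u"الْعَرَبِيَّةُ"
print(araby.strip_tashkeel(word))   # -> العربية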

Example 1: __init__

    def __init__(self, result_dict = None, order = -1):
        # ToDo
        # copy the super class attributes to the current class
        #stemmedword.stemmedWord.__init__(self, result_dict.get_dict())
        
        if result_dict: 
            self.__dict__ = result_dict.__dict__.copy()
            self.unvocalized =  araby.strip_tashkeel(self.vocalized)
            self.unvoriginal =  araby.strip_tashkeel(self.original)
        self.tag_verbal_factor  =   0
        self.tag_nominal_factor =   0
        self.tag_kana_rafe3     =   False 
        if self.is_verb():
            self.tag_kana_rafe3 =   self._is_kana_rafe3() 
        if self.is_stopword():
            self.tag_kana_rafe3 =   self._is_kana_rafe3()  
            self.tag_nominal_factor = self.__get_nominal_factor()
            #verbal factor
            self.tag_verbal_factor  = self.__get_verbal_factor()

        self.tag_addition =  self._is_addition()                

        self.tag_break =  self._is_break() 
        self.forced_word_case = False
        self.syntax =  u""   # used for syntactic analysis purposes
        self.semantic =  u""  # used for semantic analysis purposes
        self.forced_wordtype = False        
        self.order =  order
        self.next =  {}
        self.previous =  {}
        self.sem_next =  {}
        self.sem_previous =  {}
        self.score =  0
        self.rule = 0  # rule used to select the current case in vocalization
Developer: linuxscout | Project: mishkal | Lines of code: 34 | Source: stemmedsynword.py

Example 2: check_normalized

def check_normalized(word_vocalised, resulted_data):
    """
    If the entered word matches a word found in the dictionary,
    handle some normalized cases: the analyzer returns all words
    that look alike once normalized.
    For example, if the word is ذئب, its normalized form is ذءب,
    which can yield ذئب and ذؤب from the dictionary.
    This function filters the normalized results against
    the given word and keeps ذئب.
    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the results found in the dictionary.
    @type resulted_data: list of dict.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    #print word_vocalised.encode('utf8')
    filtred_data = []
    inputword = araby.strip_tashkeel(word_vocalised)
    for item in  resulted_data:
        if 'vocalized' in item.__dict__ : #.has_key('vocalized') :
        #~ if 'vocalized' in item :
            #~ outputword = araby.strip_tashkeel(item['vocalized'])
            outputword = araby.strip_tashkeel(item.__dict__['vocalized'])
            #print u'\t'.join([inputword, outputword]).encode('utf8')
            if inputword == outputword:
                #item['tags'] += ':a'
                filtred_data.append(item)
    return  filtred_data
Developer: ihfazhillah | Project: mishkal | Lines of code: 29 | Source: analex.py
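The same filtering idea can be sketched with plain dictionaries; this is a hedged, self-contained variant (the candidate list below is illustrative, not mishkal's real analysis objects):

from pyarabic import araby

def filter_by_unvocalized(word_vocalised, candidates):
    # keep only candidates whose letters match the input once tashkeel is removed
    inputword = araby.strip_tashkeel(word_vocalised)
    return [c for c in candidates
            if araby.strip_tashkeel(c[u'vocalized']) == inputword]

candidates = [{u'vocalized': u'ذِئْب'}, {u'vocalized': u'ذُؤْب'}]
print(filter_by_unvocalized(u'ذئب', candidates))   # keeps only ذِئْب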

Example 3: get_word_variant

def get_word_variant(word, suffix, encletic):
    """
    Get the word variant to be joined to the suffix.
    For example: word = مدرسة, suffix = ي. The word is converted to مدرست.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first level).
    @type suffix: unicode.
    @param encletic: enclitic (second level).
    @type encletic: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    word_stem = word
    
    suffix_nm = araby.strip_tashkeel(suffix)

    encletic_nm = araby.strip_tashkeel(encletic)
    long_suffix_nm = suffix_nm + encletic_nm 
    # if the word ends with a haraka, strip it
    word_stem = araby.strip_lastharaka(word_stem)
    
    # feminine noun ending in teh marbuta: drop it before suffixes such as ات and ية
    if word_stem.endswith(araby.TEH_MARBUTA):
        if suffix_nm in (araby.ALEF + araby.TEH, araby.YEH + araby.TEH_MARBUTA,
                         araby.YEH, araby.YEH + araby.ALEF + araby.TEH):
            word_stem = word_stem[:-1]
        # otherwise the teh marbuta opens into a regular teh before attaching
        # مدرسة +ين = مدرستين
        elif long_suffix_nm != u"":
            word_stem = word_stem[:-1] + araby.TEH

    elif word_stem.endswith(araby.ALEF_MAKSURA):
        # maqsur noun (ending in alef maksura): joined to a grammatical
        # suffix the alef maksura becomes yeh, e.g. مستوى +ان = مستويان
        # if the morphological suffix carries letters, use yeh
        if suffix_nm != u"":
            word_stem = word_stem[:-1] + araby.YEH
        # if the suffix is only harakat and there is an attached pronoun, use alef
        elif encletic_nm != u"":
            word_stem = word_stem[:-1] + araby.ALEF
    elif word_stem.endswith(araby.KASRA + araby.YEH):
        # manqus noun (ending in yeh preceded by kasra):
        # with no pronoun and a harakat-only suffix,
        # drop the final kasra and yeh
        if not encletic_nm and not suffix_nm:
            word_stem = word_stem[:-2]

    # adjust a final hamza according to the haraka of the grammatical suffix
    elif word_stem.endswith(araby.HAMZA) and suffix_nm != u"":
        if suffix.startswith(araby.DAMMA):
            word_stem = word_stem[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            word_stem = word_stem[:-1] + araby.YEH_HAMZA
        elif (word_stem.endswith(araby.YEH + araby.HAMZA) or
              word_stem.endswith(araby.YEH + araby.SUKUN + araby.HAMZA)) \
              and suffix.startswith(araby.FATHATAN):
            word_stem = word_stem[:-1] + araby.YEH_HAMZA            
    return word_stem
Developer: ihfazhillah | Project: mishkal | Lines of code: 58 | Source: stem_noun.py
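A hedged mini-demo of two of the rules above, using only pyarabic constants (the word and suffix values are illustrative):

from pyarabic import araby

word = u"مدرسة"
suffix_nm = araby.strip_tashkeel(u"ين")           # a letter-bearing suffix
if word.endswith(araby.TEH_MARBUTA) and suffix_nm:
    word = word[:-1] + araby.TEH                  # مدرسة -> مدرست

word2 = u"مستوى"
if word2.endswith(araby.ALEF_MAKSURA) and suffix_nm:
    word2 = word2[:-1] + araby.YEH                # مستوى -> مستوي

print(word + u" " + word2)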

Example 4: detect_number_phrases_position

def detect_number_phrases_position(wordlist):
    """
    Detect number words in a text and return positions of each phrase.

    Example:
        >>> txt = u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا"
        >>> wordlist = araby.tokenize(txt)
        >>> positions_phrases = detect_number_phrases_position(wordlist)
        >>> print positions_phrases
        [(1, 3), (6, 7)]

    @param wordlist: wordlist
    @type wordlist: unicode list
    @return: list of number phrase positions [(start, end), (start2, end2), ...]
    @rtype: list of tuple
    """
    #~ wordlist# = text.split(u' ')
    #print words
    phrases = []
    startnumber = -1
    endnumber = False
    #~ taglist = []
    for i, word in enumerate(wordlist):
        #~ word = wordlist[i]
        if i+1 < len(wordlist):
            nextword = araby.strip_tashkeel(wordlist[i+1])
        else: nextword = None
        #save the original word with possible harakat if exist
        word_nm = araby.strip_tashkeel(word)
        key = word_nm
        # the first word can have prefixes
        if word_nm and not startnumber and word_nm != u'واحد' \
            and word_nm[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
            key = word_nm[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
        if key in nbconst.NUMBER_WORDS or key.isnumeric():
            if key not in (u'أحد', u'إحدى', u'اثنا', u'اثني', u'اثنتي', \
             u'اثنتا')  or nextword in (u'عشر', u'عشرة'):
                if startnumber < 0:
                    startnumber = i
                endnumber = i
            # phrase.append(word)
        else:
            if startnumber >= 0:  # there is a previous number phrase
                phrases.append((startnumber, endnumber))
            startnumber = -1
    # add the final phrases
    if startnumber >= 0:  # there is a previous number phrase
        phrases.append((startnumber, endnumber))

    return phrases
Developer: linuxscout | Project: pyarabic | Lines of code: 53 | Source: number.py
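A usage sketch matching the docstring above (assuming the function and its nbconst dependency are importable):

from pyarabic import araby

txt = u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا"
wordlist = araby.tokenize(txt)
print(detect_number_phrases_position(wordlist))   # expected: [(1, 3), (6, 7)]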

Example 5: is_possible_collocation

    def is_possible_collocation(self, list2, context = "", lenght = 2):
        """
        Guess whether the given list is a possible collocation.
        This is used to collect unknown collocations from user input;
        it returns a numeric rule code rather than True or False.
        @param list2: list of 2 or more words.
        @type list2: list of unicode.
        @param lenght: minimum number of words in the collocation.
        @type lenght: integer.
        @return: the rule of the found collocation, 100 by default.
        @rtype: integer.
        """        
        if len(list2)<lenght:
            return 0
        else:
            item_v1 = list2[0]
            item_v2 = list2[1]
            item1 = araby.strip_tashkeel(item_v1)
            item2 = araby.strip_tashkeel(item_v2)        
            #if item1[-1:] in (u".", u"?", u", ", u'[', u']', u'(', ')'):
            #    return 0
            if  not cconst.token_pat.search(item1) or not \
            cconst.token_pat.search(item2) :
                return -1
            #else: return 100
            elif item1 in cconst.ADDITIONAL_WORDS :
                return 10
            elif item1 in cconst.NAMED_PRIOR :
                return 15            
            elif (item2 not in cconst.SPECIAL_DEFINED):
                if  item2.startswith(u'ال') and  item1.startswith(u'ال'):
                    return 20
                elif item1.endswith(u'ة') and item2.startswith(u'ال'):
                    return 30

                # case of words starting with the preposition lam plus the definite article:
                # no rule is needed because they are always genitive (majrour)
                #if  item2.startswith(u'لل'):
                #    return 40
                elif item1.endswith(u'ة') and item2.endswith(u'ة')  :
                    return 40
                #if item1.endswith(u'ي') and item2.endswith(u'ي'):
                #    return 60

                elif  context != u"" and context in cconst.tab_noun_context \
                and item2.startswith(u'ال') :
                    return 50
                #return True

                elif item1.endswith(u'ات') and item2.startswith(u'ال') :
                    return 60
            return 100
Developer: assem-ch | Project: mishkal | Lines of code: 52 | Source: collocations.py
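A hedged sketch of the core pattern: strip tashkeel first, then apply simple surface heuristics. The function below keeps only two of the rules above (scores 20 and 30, plus the 100 default) and is not mishkal's full rule set:

from pyarabic import araby

def simple_collocation_score(word1, word2):
    item1 = araby.strip_tashkeel(word1)
    item2 = araby.strip_tashkeel(word2)
    if item1.startswith(u'ال') and item2.startswith(u'ال'):
        return 20
    if item1.endswith(u'ة') and item2.startswith(u'ال'):
        return 30
    return 100

print(simple_collocation_score(u"مَدْرَسَةُ", u"الوَلَدِ"))   # -> 30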

Example 6: detect_numbers

def detect_numbers(wordlist):
    """
    Detect number words in a text and return a taglist as BIO.

    Example:
        >>> wordlist = araby.tokenize(u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا")
        >>> detect_numbers(wordlist)
        ['O', 'DB', 'DI', 'DI', 'O', 'O', 'DB', 'DI', 'O']

    @param wordlist: wordlist
    @type wordlist: unicode list
    @return: list of tags BIO
    @rtype: list of unicode
    """
    #~ phrases = []
    starts = False
    taglist = []

    for i, word in enumerate(wordlist):
        #~ word = wordlist[i]
        if i+1 < len(wordlist):
            nextword = araby.strip_tashkeel(wordlist[i+1])
        else:
            nextword = None
        #save the original word with possible harakat if exist
        word_nm = araby.strip_tashkeel(word)
        key = word_nm
        # the first word can have prefixes
        if word_nm and not starts and word_nm != u'واحد' \
            and word_nm[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
            key = word_nm[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
        if key in nbconst.NUMBER_WORDS or key.isnumeric():
            if key not in (u'أحد', u'إحدى', u'اثنا', u'اثني', u'اثنتي', \
             u'اثنتا')  or nextword in (u'عشر', u'عشرة'):
                if not starts:
                    taglist.append("DB")
                    starts = True
                else:
                    taglist.append("DI")
            else:
                starts = False
                taglist.append("O")
        else:
            starts = False
            taglist.append("O")
    return taglist
Developer: linuxscout | Project: pyarabic | Lines of code: 48 | Source: number.py
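Usage sketch mirroring the docstring (again assuming nbconst is importable next to this function):

from pyarabic import araby

wordlist = araby.tokenize(u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا")
print(detect_numbers(wordlist))
# expected: ['O', 'DB', 'DI', 'DI', 'O', 'O', 'DB', 'DI', 'O']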

Example 7: detect_number_words

def detect_number_words(text):
    """
    Detect number words in a text.
    @param text: input text
    @type text: unicode
    @return: None; prints diagnostics for detected number phrases
    (e.g. خمسمئة وثلاثة وعشرين) whose vocalization conflicts with the text
    >>> detect_number_words(u"وجدت خمسمئة وثلاثة وعشرين دينارا")
    """

    #~ words = araby.tokenize(text)
    #print words
    phrases_context = extract_number_context(text)
    for ph_con in phrases_context:
        if len(ph_con) >= 3:
            previous = ph_con[0]
            phrase = ph_con[1]
            nextword = ph_con[2]
            numberedwords = phrase
            numeric = text2number(numberedwords)
            tags = get_previous_tag(previous)
            vocalized = vocalize_number(araby.strip_tashkeel(\
            numberedwords).split(' '), tags)                
            #calcul  vocalization similarity : 
            sim = araby.vocalized_similarity(numberedwords, vocalized)
            voc_unit = vocalize_unit(numeric, nextword)
            sim_unit = araby.vocalized_similarity(voc_unit, \
                nextword)                    
            if sim < 0:
                print u'\t'.join([str(sim), numberedwords, vocalized, \
                 str(numeric), u' '.join([previous, phrase, nextword]), \
                  nextword, voc_unit, str(sim_unit)]).encode('utf8')
Developer: ihfazhillah | Project: mishkal | Lines of code: 33 | Source: number.py
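The sim < 0 test above relies on araby.vocalized_similarity returning a negative value when two vocalizations conflict. A hedged illustration (exact return values may vary by pyarabic version):

from pyarabic import araby

# compatible: the second form simply omits some harakat
print(araby.vocalized_similarity(u"خَمْسُمِئَة", u"خمسمئة"))
# conflicting harakat on the same letters should give a negative value
print(araby.vocalized_similarity(u"كَتَبَ", u"كُتِبَ"))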

Example 8: wordtag

def wordtag(text):
    """
    Tag each word as noun, verb, or tool (stop word).
    """
    import naftawayh.wordtag
    tagger = naftawayh.wordtag.WordTagger()
    word_list = token_text(text)

    if len(word_list) == 0:
        return []
    else:
        list_result = []
        second_previous =""
        previous = u""
        #~previous_tag  =  ""        
        for word in word_list:
            word_nm = araby.strip_tashkeel(word)
            tag = ''
            if tagger.is_stopword(word):
                tag = 't'
            else:
                if tagger.is_noun(word):
                    tag += 'n'
                if tagger.is_verb(word):
                    tag += 'v'
                if tag in ("", "nv"):
                    tag = tagger.context_analyse(previous, word)+"1"
                    if tag in ("", "nv1", "vn1"):
                        tag = tagger.context_analyse(u" ".join([second_previous, previous]), word)+"2"                    
            list_result.append({'word':word, 'tag': tag})
            second_previous = previous
            previous = word_nm
            #~previous_tag  =  tag
        return list_result
Developer: linuxscout | Project: mishkal | Lines of code: 34 | Source: adaat.py
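Usage sketch (naftawayh must be installed, and token_text is a project-specific helper, so treat this call as illustrative only):

for item in wordtag(u"ذهب الولد إلى المدرسة الجديدة"):
    print(item['word'] + u" : " + item['tag'])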

Example 9: vocalize_named

def vocalize_named(wordlist, syn_tags = ""):
    """ Vocalize named entity words
    @param wordlist: words to vocalize
    @type wordlist: unicode list
    @param syn_tags: tags about the clause
    @type syn_tags: unicode
    @return: the vocalized wordlist.
    @rtype: unicode
    """
    newlist = []    
    #~ prefix = u""    
    #~ nextword = u""    
    #detect tags 
    # we can pass tags to this number word
    tags =  syn_tags    
    bin_count = 0    
    for i in range(len(wordlist)):
        #save the original word with possible harakat if exist
        word = wordlist[i]    
        word_nm = araby.strip_tashkeel(word)    
        # the first word can have prefixes 
        if i == 0 and word_nm:  
            # word to get majrour tag
            if word_nm in (u'أبي', u'بنو', u'آل', u'ابن',):
                tags += u"مجرور"    
            elif word_nm in (u'أبو', ):
                tags += u"مرفوع"    
            elif word_nm in (u'أبا', ):
                tags += u"منصوب"    
Developer: assem-ch | Project: mishkal | Lines of code: 29 | Source: named.py

Example 10: get_word_variant

def get_word_variant(word, suffix):
    """
    Get the word variant to be joined to the suffix.
    For example: word = مدرسة, suffix = ي. The word is converted to مدرست.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    word_stem = word
    # print word.encode('utf8')
    #HARAKAT = (FATHA, DAMMA, KASRA, SUKUN, DAMMA, DAMMATAN, 
    # KASRATAN, FATHATAN)
    suffix_nm = araby.strip_tashkeel(suffix)
    # if the word ends with a haraka, strip it
    word_stem = araby.strip_lastharaka(word_stem)

    if word_stem.endswith(araby.TEH_MARBUTA) and suffix_nm in (
    araby.ALEF+araby.TEH, araby.YEH+araby.TEH_MARBUTA, 
    araby.YEH, araby.YEH+araby.ALEF+araby.TEH):
        word_stem = word_stem[:-1]
    elif word_stem.endswith(araby.TEH_MARBUTA) and suffix_nm != u"":
        word_stem = word_stem[:-1]+araby.TEH
    elif word_stem.endswith(araby.ALEF_MAKSURA) and suffix_nm != u"":
        word_stem = word_stem[:-1]+araby.YEH            
    elif word_stem.endswith(araby.HAMZA) and suffix_nm != u"":
        if suffix.startswith(araby.DAMMA):
            word_stem = word_stem[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            word_stem = word_stem[:-1] + araby.YEH_HAMZA
        elif (word_stem.endswith(araby.YEH + araby.HAMZA) or
              word_stem.endswith(araby.YEH + araby.SUKUN + araby.HAMZA)) \
              and suffix.startswith(araby.FATHATAN):
            word_stem = word_stem[:-1] + araby.YEH_HAMZA            
    return word_stem
Developer: tazjel | Project: mishkal | Lines of code: 35 | Source: stem_noun.py

Example 11: get_suffix_variants

def get_suffix_variants(word, suffix, enclitic):
    """
    Get the suffix variant to be joined to the word.
    For example: word = مدرس, suffix = ة, enclitic = ي. 
    The suffix is converted to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.        
    @return: variant of suffixes  (vocalized suffix and vocalized 
    suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    enclitic_nm = araby.strip_tashkeel(enclitic)
    newsuffix = suffix #default value
    #if the word ends by a haraka
    if suffix.find(araby.TEH_MARBUTA) >= 0 and len(enclitic_nm) > 0:
        newsuffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)

    elif  not enclitic_nm and word[-1:] in (araby.YEH, araby.ALEF) and araby.is_haraka(suffix):
        newsuffix = u""        
    # generate the suffix without the i'rab short mark
    # here we lookup with given suffix because the new suffix is 
    # changed and can be not found in table
    if u'متحرك' in snconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
        suffix_non_irab_mark = araby.strip_lastharaka(newsuffix)
    else:
        suffix_non_irab_mark = newsuffix
    return newsuffix, suffix_non_irab_mark 
Developer: tazjel | Project: mishkal | Lines of code: 31 | Source: stem_noun.py
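A hedged mini-demo of the teh marbuta rule above (the suffix and enclitic values are illustrative):

import re
from pyarabic import araby

suffix = u"ةُ"
enclitic_nm = araby.strip_tashkeel(u"ي")
if araby.TEH_MARBUTA in suffix and enclitic_nm:
    suffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
print(suffix)   # the feminine ending surfaces as teh before an enclitic: مدرس + ت + ي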

Example 12: get_word_variant

def get_word_variant(word, suffix):
    """
    Get the word variant to be joined to the suffix.
    For example: word = مدرسة, suffix = ي. The word is converted to مدرست.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    word_stem = word
    suffix_nm = araby.strip_tashkeel(suffix)
    # if the word ends with a haraka, strip it when the suffix is not empty
    if suffix:
        word_stem = araby.strip_lastharaka(word_stem)

    if word_stem.endswith(araby.ALEF_MAKSURA) and suffix_nm != u"":
        word_stem = word_stem[:-1]+araby.YEH            
    elif word_stem.endswith(araby.HAMZA) and suffix_nm != u"":
        if suffix.startswith(araby.DAMMA):
            word_stem = word_stem[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            word_stem = word_stem[:-1] + araby.YEH_HAMZA
            
    return word_stem
Developer: assem-ch | Project: mishkal | Lines of code: 26 | Source: stem_stop.py

Example 13: search_arabic

 def search_arabic(self, q, fetch_subgraph = True, limit = DEFAULT_LIMIT,
                   fetchplan = DEFAULT_FETCHPLAN):
     """
     Searches for given label intelligently handling vocalization.
     (This does not make much sense without a fetchplan as you will get
     index nodes only.)
     
     """
     # If query is not vocalized, search unvocalized index and eventually
     # return subtree
     if not araby.is_vocalized(q):
         return self.search_index(q, fetch_subgraph,
                                  "ArabicNode.unvocalized_label", limit,
                                  fetchplan)
         
     # If it is vocalized, search unvocalized index and check for
     # "compatibility" of vocalization
     matches = self.search_index(araby.strip_tashkeel(q), False,
                                 "ArabicNode.unvocalized_label", limit)
     rids = [n.rid for n in matches.primary_results
             if Tools.is_vocalized_like(q, n.data["label"])]
     # Ignore vocalization if there is no compatible one
     if not rids:
         rids = [n.rid for n in matches.primary_results]
     return self.get_nodes(rids, fetch_subgraph, limit, fetchplan)
Developer: mirko-vogel | Project: shabaka | Lines of code: 25 | Source: ArabicWordGraph.py
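A hedged sketch of the key preprocessing step above: decide whether the query carries tashkeel and derive the bare-letter key used for the unvocalized index (the index name in the example is project-specific):

from pyarabic import araby

q = u"كِتابٌ"
if araby.is_vocalized(q):
    key = araby.strip_tashkeel(q)   # lookups run against the bare letters
else:
    key = q
print(key)   # -> كتاب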

Example 14: set_vocalized

 def set_vocalized(self, newvocalized):
     """
     Set the vocalized word
     @param newvocalized: the new given vocalized.
     @type newvocalized: unicode string
     """
     self.vocalized = newvocalized
     self.unvocalized = araby.strip_tashkeel(newvocalized)
Developer: assem-ch | Project: mishkal | Lines of code: 8 | Source: stemmedword.py

Example 15: detect_number_phrases_position

def detect_number_phrases_position(wordlist):
    """
    Detect number words in a text and return positions of each phrase.
    @param wordlist: wordlist
    @type wordlist: unicode list
    @return: list of number phrase positions [(start, end), (start2, end2), ...]
    @rtype: list of tuple
    >>> wordlist = araby.tokenize(u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا")
    >>> detect_number_phrases_position(wordlist)
    [(1, 3), (6, 7)]
    """
    #~ wordlist# = text.split(u' ')
    #print words
    phrases = []
    startnumber = -1
    endnumber = False
    taglist = []
    for i in range(len(wordlist)):
        word = wordlist[i]
        if i+1 < len(wordlist):
            nextword = araby.strip_tashkeel(wordlist[i+1])
        else: nextword = None
        #save the original word with possible harakat if exist
        word_nm = araby.strip_tashkeel(word)
        key = word_nm
        # the first word can have prefixes 
        if word_nm and not startnumber and word_nm != u'واحد' \
            and word_nm[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
            key = word_nm[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
        if key in nbconst.NumberWords:
            if key not in (u'أحد', u'إحدى', u'اثنا', u'اثني', u'اثنتي', \
             u'اثنتا') or nextword in (u'عشر', u'عشرة'):
                if startnumber < 0:
                    startnumber = i
                endnumber = i
            # phrase.append(word)
        else:
            if startnumber >= 0:  # there is a previous number phrase
                phrases.append((startnumber, endnumber))
            startnumber = -1
    # add the final phrases 
    if startnumber >= 0:  # there is a previous number phrase
        phrases.append((startnumber, endnumber))

    return phrases
Developer: ihfazhillah | Project: mishkal | Lines of code: 46 | Source: number.py


Note: the pyarabic.araby.strip_tashkeel examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub/MSDocs. The code snippets are selected from open-source projects contributed by their authors; copyright of the source code belongs to the original authors, and distribution and use should follow each project's License. Do not reproduce without permission.