Natural Language Processing - Porter Stemmer Algorithm
Stemming is the process of reducing a word to its word stem that affixes to suffixes and prefixes or to the roots of words known as a lemma. Porter-Stemmer algorithm was presented in 1980 for English words. It is based on the idea that the suffixes in the English language are made up of a combination of smaller and simple suffixes. This algorithm was developed by Martin Porter at the University of Cambridge. This is a process for removing the commoner morphological and inflexional endings from words in English. Its main use is as part of a term normalization process that is usually done in various programming languages. The Porter Stemmer is a linear step stemmer. It applies morphological rules sequentially allowing removing affixes in stages. It is based in 5 steps and the rules take effect until one of them passes the conditions. If a rule is accepted, the suffix is removed accordingly and the next step is made. The resultant stem at the end of the fifth step is returned. The rule is as follows:
modified python code for the algorithm is mentioned below. you can run the code using a python environment and give user inputs as words.
class PorterStemmer:
def isCons(self,letter):
if letter=='a' or letter=='e' or letter =='i' or letter=='o' or letter=='u':
return False
else:
return True
def isConsonant(self,word,i):
letter=word[i]
if self.isCons(letter):
if letter=='y' and self.isCons(word[i-1]):
return False
else:
return True
def isVowel(self,word,i):
return not(self.isConsonant(word,i))
def endsWith(self,stem,letter):
if stem.endswith(letter):
return True
else:
return False
def containsVowel(self,stem):
for i in stem:
if not self.isCons(i):
return True
return False
def doubleCons(self,stem):
if len(stem)>=2:
if self.isConsonant(stem,-1) and self.isConsonant(stem,-2):
return True
else:
return False
else:
return False
def getForm(self,word):
form=[]
formStr=''
for i in range(len(word)):
if self.isConsonant(word,i):
if i!=0:
prev=form[-1]
if prev!='C':
form.append('C')
else:
form.append('C')
else:
if i!=0:
prev=form[-1]
if prev!='V':
form.append('V')
else:
form.append('V')
for j in form:
formStr+=j
return formStr
def getM(self,word):
form=self.getForm(word)
m=form.count('VC')
return m
def cvc(self,word):
if len(word)>=3:
f=-3
s=-2
t=-1
third=word[t]
if self.isConsonant(word,f) and self.isVowel(word,s) and self.isConsonant(word,t):
if third!='w' and third!='x' and third!='z':
return True
else:
return False
else:
return False
else:
return False
def replace(self,orig,rem,rep):
result=orig.rfind(rem)
base=orig[:result]
replaced=base+rep
return replaced
def replaceM0(self, orig, rem, rep):
result = orig.rfind(rem)
base = orig[:result]
if self.getM(base) > 0:
replaced = base + rep
return replaced
else:
return orig
def replaceM1(self, orig, rem, rep):
result = orig.rfind(rem)
base = orig[:result]
if self.getM(base) > 1:
replaced = base + rep
return replaced
else:
return orig
def step1a(self, word):
if word.endswith('sses'):
word = self.replace(word, 'sses', 'ss')
elif word.endswith('ies'):
word = self.replace(word, 'ies', 'i')
elif word.endswith('ss'):
word = self.replace(word, 'ss', 'ss')
elif word.endswith('s'):
word = self.replace(word, 's', '')
else:
pass
return word
def step1b(self, word):
flag = False
if word.endswith('eed'):
result = word.rfind('eed')
base = word[:result]
if self.getM(base) > 0:
word = base
word += 'ee'
elif word.endswith('ed'):
result = word.rfind('ed')
base = word[:result]
if self.containsVowel(base):
word = base
flag = True
elif word.endswith('ing'):
result = word.rfind('ing')
base = word[:result]
if self.containsVowel(base):
word = base
flag = True
if flag:
if word.endswith('at') or word.endswith('bl') or word.endswith('iz'):
word+='e'
elif self.doubleCons(word) and not self.endsWith(word,'l') and not self.endsWith(word,'s') and not self.endsWith(word,'z'):
word=word[:-1]
elif self.getM(word)==1 and self.cvc(word):
word+='e'
else:
pass
else:
pass
return word
def step1c(self, word):
if word.endswith('y'):
result = word.rfind('y')
base = word[:result]
if self.containsVowel(base):
word = base
word += 'i'
return word
def step2(self, word):
if word.endswith('ational'):
word = self.replaceM0(word, 'ational', 'ate')
elif word.endswith('tional'):
word = self.replaceM0(word, 'tional', 'tion')
elif word.endswith('enci'):
word = self.replaceM0(word, 'enci', 'ence')
elif word.endswith('anci'):
word = self.replaceM0(word, 'anci', 'ance')
elif word.endswith('izer'):
word = self.replaceM0(word, 'izer', 'ize')
elif word.endswith('abli'):
word = self.replaceM0(word, 'abli', 'able')
elif word.endswith('alli'):
word = self.replaceM0(word, 'alli', 'al')
elif word.endswith('entli'):
word = self.replaceM0(word, 'entli', 'ent')
elif word.endswith('eli'):
word = self.replaceM0(word, 'eli', 'e')
elif word.endswith('ousli'):
word = self.replaceM0(word, 'ousli', 'ous')
elif word.endswith('ization'):
word = self.replaceM0(word, 'ization', 'ize')
elif word.endswith('ation'):
word = self.replaceM0(word, 'ation', 'ate')
elif word.endswith('ator'):
word = self.replaceM0(word, 'ator', 'ate')
elif word.endswith('alism'):
word = self.replaceM0(word, 'alism', 'al')
elif word.endswith('iveness'):
word = self.replaceM0(word, 'iveness', 'ive')
elif word.endswith('fulness'):
word = self.replaceM0(word, 'fulness', 'ful')
elif word.endswith('ousness'):
word = self.replaceM0(word, 'ousness', 'ous')
elif word.endswith('aliti'):
word = self.replaceM0(word, 'aliti', 'al')
elif word.endswith('iviti'):
word = self.replaceM0(word, 'iviti', 'ive')
elif word.endswith('biliti'):
word = self.replaceM0(word, 'biliti', 'ble')
return word
def step3(self, word):
if word.endswith('icate'):
word = self.replaceM0(word, 'icate', 'ic')
elif word.endswith('ative'):
word = self.replaceM0(word, 'ative', '')
elif word.endswith('alize'):
word = self.replaceM0(word, 'alize', 'al')
elif word.endswith('iciti'):
word = self.replaceM0(word, 'iciti', 'ic')
elif word.endswith('ful'):
word = self.replaceM0(word, 'ful', '')
elif word.endswith('ness'):
word = self.replaceM0(word, 'ness', '')
return word
def step4(self, word):
if word.endswith('al'):
word = self.replaceM1(word, 'al', '')
elif word.endswith('ance'):
word = self.replaceM1(word, 'ance', '')
elif word.endswith('ence'):
word = self.replaceM1(word, 'ence', '')
elif word.endswith('er'):
word = self.replaceM1(word, 'er', '')
elif word.endswith('ic'):
word = self.replaceM1(word, 'ic', '')
elif word.endswith('able'):
word = self.replaceM1(word, 'able', '')
elif word.endswith('ible'):
word = self.replaceM1(word, 'ible', '')
elif word.endswith('ant'):
word = self.replaceM1(word, 'ant', '')
elif word.endswith('ement'):
word = self.replaceM1(word, 'ement', '')
elif word.endswith('ment'):
word = self.replaceM1(word, 'ment', '')
elif word.endswith('ent'):
word = self.replaceM1(word, 'ent', '')
elif word.endswith('ou'):
word = self.replaceM1(word, 'ou', '')
elif word.endswith('ism'):
word = self.replaceM1(word, 'ism', '')
elif word.endswith('ate'):
word = self.replaceM1(word, 'ate', '')
elif word.endswith('iti'):
word = self.replaceM1(word, 'iti', '')
elif word.endswith('ous'):
word = self.replaceM1(word, 'ous', '')
elif word.endswith('ive'):
word = self.replaceM1(word, 'ive', '')
elif word.endswith('ize'):
word = self.replaceM1(word, 'ize', '')
elif word.endswith('ion'):
result = word.rfind('ion')
base = word[:result]
if self.getM(base) >1 and (self.endsWith(base,'s') or self.endsWith(base,'t')):
word=base
word=self.replaceM1(word,'','')
return word
def step5a(self, word):
if word.endswith('e'):
base = word[:-1]
if self.getM(base) > 1:
word = base
elif self.getM(base) == 1 and not self.cvc(base):
word = base
return word
def step5b(self, word):
if self.getM(word) > 1 and self.doubleCons(word) and self.endsWith(word, 'l'):
word=word[:-1]
return word
def stem(self, word):
word = self.step1a(word)
word = self.step1b(word)
word = self.step1c(word)
word = self.step2(word)
word = self.step3(word)
word = self.step4(word)
word = self.step5a(word)
word = self.step5b(word)
return word
word=input("Enter the word: ")
line=PorterStemmer()
print(line.stem(word))
please comment if you have any concerns about the code or the algorithm . i will mention the link below for the pseudo code for the porter stemmer algorithm.
adios! 😎
Comments
Post a Comment