# SearchEngine

A Python search engine built with NLP methods for a Django project.

In this post, we will build a semantic document search engine.

## Prerequisites

- Python >= 3.7
- NLTK
- Pandas
- Scikit-learn

## Imports

```python
import re, json
import unicodedata, string
import time
import operator
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
```

## Data

Files used in the notebook are stored in the `data` folder.

1: Build the keywords of a sentence, based on the words of a dictionary and a text corpus, by going through tokenization, correction, lemmatization and stop-word removal.


## Preprocessing

```python
def get_dico():
    """Load the French word list used as the reference dictionary."""
    textdir = "liste.de.mots.francais.frgut_.txt"
    try:
        DICO = open(textdir, 'r', encoding="utf-8").read()
    except UnicodeDecodeError:
        DICO = open(textdir, 'r').read()
    return DICO


def remove_accents(input_str):
    """Remove all diacritic marks from the given string."""
    norm_txt = unicodedata.normalize('NFD', input_str)
    shaved = ''.join(c for c in norm_txt if not unicodedata.combining(c))
    return unicodedata.normalize('NFC', shaved)


def clean_sentence(texte):
    # Replace diacritics
    texte = remove_accents(texte)
    # Lowercase the document
    texte = texte.lower()
    # Remove mentions
    texte = re.sub(r'@\w+', '', texte)
    # Remove punctuation
    texte = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', texte)
    # Collapse doubled spaces
    texte = re.sub(r'\s{2,}', ' ', texte)
    # Remove whitespace at the beginning and the end
    texte = texte.strip()
    return texte


def tokenize_sentence(texte):
    # Clean the sentence
    texte = clean_sentence(texte)
    # Tokenize
    liste_words = texte.split()
    return liste_words


def strip_apostrophe(liste_words):
    # Keep the part after the apostrophe (e.g. "l'utilisateur" -> "utilisateur")
    get_radical = lambda word: word.split('\'')[-1]
    return list(map(get_radical, liste_words))


def pre_process(sentence):
    # Remove '_' from the sentence
    sentence = sentence.replace('_', '')
    # Get the words of the sentence
    liste_words = tokenize_sentence(sentence)
    # Drop words of 1 or 2 letters
    liste_words = [elt for elt in liste_words if len(elt) > 2]
    # Keep the radical after the apostrophe
    liste_words = strip_apostrophe(liste_words)
    print('\nsentence to words : ', liste_words)
    return liste_words
```
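
As a quick check, here is what `pre_process` returns on a hypothetical French sentence (example input of mine, not from the repo):

```python
pre_process("L'utilisateur n'arrive pas à créer la commande !")
# expected output: ['utilisateur', 'arrive', 'pas', 'creer', 'commande']
```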

## Word correction

```python
def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

def known(words):
    "The subset of `words` that appear in the dictionary WORDS."
    return set(w for w in words if w in WORDS)

def candidates(word):
    "Generate possible spelling corrections for `word`."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
```




```python
def DICO_ET_CORRECTEUR():
    "Return the dictionary word counts and a spelling-correction function."
    DICO = get_dico()
    # Counter takes the word list and returns a word -> count mapping
    WORDS = Counter(pre_process(DICO))
    N = sum(WORDS.values())
    P = lambda word: WORDS[word] / N                          # probability of `word`
    correction = lambda word: max(candidates(word), key=P)    # most probable correction
    return WORDS, correction

WORDS, CORRECTION = DICO_ET_CORRECTEUR()
```
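
For example (hypothetical calls; the exact result depends on the words present in `liste.de.mots.francais.frgut_.txt`):

```python
CORRECTION('commande')  # already in the dictionary -> returned as-is
CORRECTION('comande')   # one edit away -> replaced by a known word such as 'commande'
```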

## Stop words and stemming (first example)


```python
# Stop words, from https://www.ranks.nl/stopwords/french
with open('stp_words_.txt', 'r') as f:
    STOPWORDS = f.read()

# Stemmer database: a word -> base form mapping
with open("sample_.json", 'r', encoding='cp1252') as json_file:
    LISTE = json.load(json_file)
my_stemmer = lambda word: LISTE[word] if word in LISTE else word
```
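
For instance, based on the test output further below (and assuming `sample_.json` maps inflected forms to a base form):

```python
my_stemmer('approuve')  # -> 'approuver' (if present in LISTE)
my_stemmer('xyz')       # -> 'xyz' (unknown words pass through unchanged)
```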

## Function: SENTENCE_TO_CORRECT_WORDS

```python
def SENTENCE_TO_CORRECT_WORDS(sentence):
    "Return the corrected, stemmed, stop-word-free keywords of the user sentence."
    print('\n------------pre_process--------\n')
    liste_words = pre_process(sentence)
    print(liste_words)
    print('\n------------correction--------\n')
    liste_words = list(map(CORRECTION, liste_words))
    print(liste_words)
    print('\n------------stemming--------\n')
    liste_words = list(map(my_stemmer, liste_words))
    print(liste_words)
    print('\n------------remove stop-words--------\n')
    liste_words = [elt for elt in liste_words if elt not in STOPWORDS]
    print(liste_words)
    print('\n-------------------------------------\n')
    return liste_words
```

## Test: SENTENCE_TO_CORRECT_WORDS

```python
SENTENCE_TO_CORRECT_WORDS('La PR reste au statut «\xa0Approuve(e)\xa0» et il n’y a pas de commande\"\'')
```

## Output

```
------------pre_process--------
['reste', 'statut', 'approuve', 'n’y', 'pas', 'commande']

------------correction--------
['reste', 'statut', 'approuve', 'non', 'pas', 'commande']

------------stemming--------
['rester', 'statut', 'approuver', 'non', 'pas', 'commander']

------------remove stop-words--------
['rester', 'statut', 'approuver', 'commander']

-------------------------------------
['rester', 'statut', 'approuver', 'commander']
```


## Create dataset

```python
def open_file(textdir):
    """Read a text file, trying several encodings."""
    try:
        return open(textdir, 'r', encoding="utf-8").read()
    except UnicodeDecodeError:
        pass
    try:
        return open(textdir, 'r').read()
    except UnicodeDecodeError:
        return open(textdir, 'r', encoding='cp1252').read()


def add_col(df_news, titre, keywords):
    # Append a (Subject, Clean_Keyword) row
    # (note: DataFrame.append was removed in pandas 2.0; use pd.concat on recent versions)
    return df_news.append(dict(zip(df_news.columns, [titre, keywords])), ignore_index=True)


# df_news is an existing dataframe with the columns ['Subject', 'Clean_Keyword']
liste_pb = [elt for elt in open_file('liste_pb_.txt').split('\n') if elt]
df_new = df_news.drop(df_news.index)  # empty copy that keeps the columns
for i, titre in enumerate(liste_pb):
    keywords = ','.join(SENTENCE_TO_CORRECT_WORDS(titre))
    df_new = add_col(df_new, titre, keywords)
df_new.head()
```


## Output

|    | Subject | Clean_Keyword |
|----|---------|---------------|
| 0  | Message d'erreur : "Le fournisseur ARIBA n'exi... | message,erreur,fournisseur,aria,exister |
| 1  | Message d'erreur : "Commande d’article non aut... | message,erreur,commander,article,autoriser,oto |
| 2  | Message d'erreur : "Statut utilisateur FERM ac... | message,erreur,statut,utilisateur,actif,oto |
| 3  | Message d'erreur : "Statut systeme TCLO actif ... | message,erreur,statut,systeme,col,actif,nord |
| 4  | Message d'erreur "___ Cost center change could... | message,erreur,coat,centrer,changer,cold,affecter |
| 5  | Messaeg d'erreur "___ OTP change could not be ... | message,erreur,otp,changer,cold,affecter |
| 6  | Messaeg d'erreur "Entrez Centre de couts" | message,erreur,entrer,centrer,cout |
| 7  | Message d'erreur "Indiquez une seule imputatio... | message,erreur,indiquer,imputation,statistique |
| 8  | Message d'erreur "Imputations CO ont des centr... | message,erreur,imputation,centrer,profit |
| 9  | Message d'erreur "Poste ___ Ordre ___ depassem... | message,erreur,poster,ordre,depassement,budget |
| 10 | Message d'erreur "Entrez une quantite de comma... | message,erreur,entrer,quantite,commander |
| 11 | Message d'erreur "Indiquez la quantite" | message,erreur,indiquer,quantite |
| 12 | Message d'erreur "Le prix net doit etre superi... | message,erreur,prix,net,superieur |
| ... | ... | ... |
| 57 | UO4-5 Commande \| Envoi d'une commande manuelle | uo4,commander,envoi,commander,manuel |
| 58 | UO5-4 Reception \| Anomalie workflow | uo5,reception,anomalie,workflow |
| 59 | UO5-1 Reception \| Modification(s) de reception(s) | uo5,reception,modification,reception |
| 60 | UO5-2 Reception \| Annulation(s) de reception(s) | uo5,reception,annulation,reception |
| 61 | UO5-3 Reception \| Forcer la reception | uo5,reception,forcer,reception |
| 62 | UO3-5 Demande d'achat \| Demande de support cre... | uo3,demander,achat,demander,support,creation |
| 63 | UO3-6 Demande d'achat \| Demande de support mod... | uo3,demander,achat,demander,support,modification |
| 64 | UO3-7 Demande d'achat \| Demande de support ann... | uo3,demander,achat,demander,support,annulation |
| 65 | UO4-2 Commande \| Demande de support modificati... | uo4,commander,demander,support,modification,co... |

## Tokenization and lemmatization (second example)

```python
# WordNetLemmatizer requires POS tags to know whether a word is a noun, verb, adjective, etc.
# By default the tag is set to Noun.
def wordLemmatizer(data, colname):
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    file_clean_k = pd.DataFrame()
    word_Lemmatized = WordNetLemmatizer()
    for index, entry in enumerate(data):
        # Words that pass the filters below are collected here
        Final_words = []
        # pos_tag provides the tag, i.e. whether the word is a Noun (N), a Verb (V), etc.
        for word, tag in pos_tag(entry):
            # Keep only alphabetic words of more than one letter that are not stop words
            if len(word) > 1 and word not in stopwords.words('french') and word.isalpha():
                word_Final = word_Lemmatized.lemmatize(word, tag_map[tag[0]])
                Final_words.append(word_Final)
        # Store the processed words of each entry
        file_clean_k.loc[index, colname] = str(Final_words)
    # Turn "['a', 'b']" into "a,b"
    file_clean_k = file_clean_k.replace(to_replace=r"\[.", value='', regex=True)
    file_clean_k = file_clean_k.replace(to_replace="'", value='', regex=True)
    file_clean_k = file_clean_k.replace(to_replace=" ", value='', regex=True)
    file_clean_k = file_clean_k.replace(to_replace=r'\]', value='', regex=True)
    return file_clean_k


def wordLemmatizer_(sentence):
    # Take a sentence and return a str (words separated by commas)
    preprocessed_query = re.sub(r"\W+", " ", sentence).strip()
    tokens = word_tokenize(str(preprocessed_query))
    q_df = pd.DataFrame(columns=['q_clean'])
    idx = 0
    colname = 'keyword_final'
    q_df.loc[idx, 'q_clean'] = tokens
    print('\n\n---inputtoken'); print(q_df.q_clean)
    result = wordLemmatizer(q_df.q_clean, colname).loc[idx, colname]
    print('\n\n---outputlemma'); print(result)
    return result
```

2: Find the best matching sentence in a list of sentences.


## Method: TF-IDF


TF-IDF stands for Term Frequency - Inverse Document Frequency. In order to compare the user input to the existing sentences in the database, we go through two steps (a small code sketch follows the formulas below):

- Normalize the database: apply the pre-processing method to all sentences in the database. We then get, for each sentence, a list of keywords.
- For each keyword `wd` of each sentence `stc`, we compute:
  - $frqc(wd, stc)$: number of occurrences of the keyword `wd` in the sentence `stc`
  - $doc\_frqc(wd)$: number of sentences where the word appears
  - $N$: number of sentences
$$
tf(wd, stc) = \frac{frqc(wd, stc)}{\sum_{stc} frqc(wd, stc)}\\
idf(wd) = \log\left(\frac{N}{doc\_frqc(wd)}\right)\\
tfidf(wd, stc) = tf(wd, stc) \cdot idf(wd)
$$
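
Here is a minimal Python sketch of these formulas, assuming each sentence has already been reduced to its keyword list (the helper name `tfidf_table` is mine, not part of the repo):

```python
from collections import Counter
from math import log

def tfidf_table(docs_keywords):
    """docs_keywords: one keyword list per sentence; returns one {word: tfidf} dict per sentence."""
    N = len(docs_keywords)
    counts = [Counter(kws) for kws in docs_keywords]    # frqc(wd, stc)
    total = sum(counts, Counter())                      # sum over stc of frqc(wd, stc)
    doc_frqc = Counter(w for c in counts for w in c)    # number of sentences containing wd
    idf = {w: log(N / doc_frqc[w]) for w in doc_frqc}
    return [{w: (c[w] / total[w]) * idf[w] for w in c} for c in counts]

docs = [["computer", "down"], ["need", "change", "computer"], ["change", "handle", "IT"]]
tfidf_table(docs)[1]
# -> {'need': log(3), 'change': 0.5*log(3/2), 'computer': 0.5*log(3/2)}
```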

Example

st1: The computer is down
st2: We need to change the computers
st3: Changes have to be handled by the IT

keywords per sentence

st1: [computer , down]
st2: [need, change, computer]
st3: [change, handle, IT]

vocabulary: [computer , down, need, change, handle, IT]

| tf | sentence1 | sentence2 | sentence3 |
|----|-----------|-----------|-----------|
| computer | 1/2 | 1/2 | 0 |
| down | 1 | 0 | 0 |
| need | 0 | 1 | 0 |
| change | 0 | 1/2 | 1/2 |
| handle | 0 | 0 | 1 |
| IT | 0 | 0 | 1 |

idf values for the keywords

N = number_of_sentences = 3

| keyword | idf |
|---------|-----|
| computer | log(3/2) |
| down | log(3/1) |
| need | log(3/1) |
| change | log(3/2) |
| handle | log(3/1) |
| IT | log(3/1) |

Example for sentence 2: computing the tf-idf values of the keywords

$$
tfidf('computer') = tf('computer', sentence2) \cdot idf('computer') = 1/2 \cdot \log(3/2)\\
tfidf('down') = 0 \cdot \log(3/1)\\
tfidf('need') = 1 \cdot \log(3/1)\\
tfidf('change') = 1/2 \cdot \log(3/2)\\
tfidf('handle') = 0 \cdot \log(3/1)\\
tfidf('IT') = 0 \cdot \log(3/1)
$$

Vectorization of sentence 2

sentence2 <==> [ 0.5 * log(3/2), 0, 1 * log(3/1), 0.5 * log(3/2), 0, 0 ]

Vectorization of all the sentences

$$
sentence1 <==> [\ 0.5 \cdot \log(3/2),\ \log(3/1),\ 0,\ 0,\ 0,\ 0\ ]\\
sentence2 <==> [\ 0.5 \cdot \log(3/2),\ 0,\ 1 \cdot \log(3/1),\ 0.5 \cdot \log(3/2),\ 0,\ 0\ ]\\
sentence3 <==> [\ 0,\ 0,\ 0,\ 0.5 \cdot \log(3/2),\ \log(3/1),\ 1 \cdot \log(3/1)\ ]
$$

Similarities between the user input and the sentences

- user input: "The IT have replaced all of the computers"
- keywords: ['IT', 'all', 'computer']
- keywords found in the vocabulary: ['IT', 'computer']
- vectorization: [1, 0, 0, 0, 0, 1]

scores

$$
score(sentence1) = tfidf(sentence1) \cdot vector = [\ 0.5 \cdot \log(3/2),\ \log(3/1),\ 0,\ 0,\ 0,\ 0\ ] \cdot [1, 0, 0, 0, 0, 1] = 0.5 \cdot \log(3/2)\\
sentence1: 0.5 \cdot \log(3/2)\\
sentence2: 0.5 \cdot \log(3/2)\\
sentence3: \log(3/1)
$$
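
These scores can be checked with a few lines of numpy (a toy sketch of mine; the actual implementation below relies on scikit-learn's `TfidfVectorizer` and a cosine similarity rather than a raw dot product):

```python
import numpy as np

log = np.log
# vector layout follows the vocabulary order [computer, down, need, change, handle, IT]
sentence1 = np.array([0.5 * log(3/2), log(3/1), 0, 0, 0, 0])
sentence2 = np.array([0.5 * log(3/2), 0, 1 * log(3/1), 0.5 * log(3/2), 0, 0])
sentence3 = np.array([0, 0, 0, 0.5 * log(3/2), log(3/1), 1 * log(3/1)])
query = np.array([1, 0, 0, 0, 0, 1])  # 'computer' and 'IT'

for name, vec in (("sentence1", sentence1), ("sentence2", sentence2), ("sentence3", sentence3)):
    print(name, np.dot(vec, query))
# sentence1 ~ 0.20, sentence2 ~ 0.20, sentence3 ~ 1.10 -> sentence3 is the best match
```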

## Function: cosine_similarity_T


```python
def init(df_news):
    # Create the vocabulary from the pre-computed keywords
    vocabulary = set()
    for doc in df_news.Clean_Keyword:
        vocabulary.update(doc.split(','))
    vocabulary = list(vocabulary)
    # Initialize the TF-IDF model with this fixed vocabulary
    tfidf = TfidfVectorizer(vocabulary=vocabulary)
    # Fit the TF-IDF model and transform the database
    tfidf.fit(df_news.Clean_Keyword)
    tfidf_tran = tfidf.transform(df_news.Clean_Keyword)
    # Store the results as globals so cosine_similarity_T can reuse them
    globals()['vocabulary'], globals()['tfidf'], globals()['tfidf_tran'] = vocabulary, tfidf, tfidf_tran
```



Create a vector for the query/search keywords


```python
def gen_vector_T(tokens, df_news, vocabulary, tfidf, tfidf_tran):
    # Build the query vector in the vocabulary space
    Q = np.zeros((len(vocabulary)))
    x = tfidf.transform(tokens)
    for token in tokens[0].split(','):
        try:
            ind = vocabulary.index(token)
            Q[ind] = x[0, tfidf.vocabulary_[token]]
            print(token, ':', ind)
        except (ValueError, KeyError):
            # token not in the vocabulary
            print(token, ':', 'not found')
    return Q
```

Cosine Similarity function


```python
def cosine_sim(a, b):
    # Negative sentinel values flag zero vectors (no shared vocabulary)
    if not np.linalg.norm(a) and not np.linalg.norm(b): return -3
    if not np.linalg.norm(a): return -1
    if not np.linalg.norm(b): return -2
    cos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return cos_sim
```

```python
def cosine_similarity_T(k, query, df_news, vocabulary=None, tfidf=None, tfidf_tran=None, mine=True):
    try:
        vocabulary = globals()['vocabulary']
        tfidf = globals()['tfidf']
        tfidf_tran = globals()['tfidf_tran']
    except KeyError:
        # The model has not been initialized yet
        init(df_news)
        vocabulary, tfidf, tfidf_tran = globals()['vocabulary'], globals()['tfidf'], globals()['tfidf_tran']

    # Build the cleaned query: either with our pipeline or with the WordNet lemmatizer
    q_df = pd.DataFrame(columns=['q_clean'])
    if mine:
        q_df.loc[0, 'q_clean'] = ','.join(SENTENCE_TO_CORRECT_WORDS(query))
    else:
        q_df.loc[0, 'q_clean'] = wordLemmatizer_(query)
    print('\n\n---q_df'); print(q_df)
    print('\n\n')

    # Cosine similarity between the query vector and every sentence of the database
    d_cosines = []
    query_vector = gen_vector_T(q_df['q_clean'], df_news, vocabulary, tfidf, tfidf_tran)
    for d in tfidf_tran.A:
        d_cosines.append(cosine_sim(query_vector, d))

    # Keep the k best matches
    out = np.array(d_cosines).argsort()[-k:][::-1]
    d_cosines.sort()
    a = pd.DataFrame()
    for i, index in enumerate(out):
        a.loc[i, 'index'] = str(index)
        a.loc[i, 'Subject'] = df_news['Subject'][index]
    for j, simScore in enumerate(d_cosines[-k:][::-1]):
        a.loc[j, 'Score'] = simScore
    return a
```

## Test: cosine_similarity_T


```python
def test(data, sentence, init_=False, mine=True):
    if not init_:
        deb = time.time(); print('\n\n###########')
        init(df_news)
        print('\n###########temps init: ', time.time() - deb)
    deb = time.time(); print('\n\n###########')
    print(cosine_similarity_T(10, sentence, df_news))
    print('\n###########temps methode 1: ', time.time() - deb)

# Several example queries; only the last assignment is used below
sentence = 'Message d\'erreur "La qte livree est differente de la qte facturee ; fonction impossible"'
sentence = 'erreur de conversion'
sentence = 'message d\'erreur'
sentence = "groupe d'acheteurs non défini"
sentence = "UO4"
sentence = "le fournisseur MDM n'existe pas"

init(df_new)
cosine_similarity_T(10, sentence, df_new)
```

## Output


```
------------pre_process--------
['fournisseur', 'mdm', 'existe', 'pas']

------------correction--------
['fournisseur', 'mdm', 'existe', 'pas']

------------stemming--------
['fournisseur', 'mdm', 'exister', 'pas']

------------remove stop-words--------
['fournisseur', 'mdm', 'exister']

-------------------------------------
```

|   | index | Subject | Score |
|---|-------|---------|-------|
| 0 | 19 | Message d'erreur "Le fournisseur MDM___ n’exis... | 0.781490 |
| 1 | 0  | Message d'erreur : "Le fournisseur ARIBA n'exi... | 0.600296 |
| 2 | 20 | Message d'erreur "Le fournisseur MDM___ est bl... | 0.587467 |
| 3 | 14 | Message d'erreur "Le centre de profit __ n'exi... | 0.236420 |
| 4 | 33 | Message d'erreur "Il existe des factures pour ... | 0.214371 |
| 5 | 53 | Message d'erreur "Fournisseur non present dans... | 0.142208 |
| 6 | 18 | Message d'erreur "Validation ___ : le compte _... | 0.000000 |
| 7 | 30 | Message d'erreur "Renseigner correctement le d... | 0.000000 |
| 8 | 29 | Message d'erreur "Article ___ non gere dans la... | 0.000000 |
| 9 | 28 | Message d'erreur "Fonctions oblig. Suivantes n... | 0.000000 |
| ... | ... | ... | ... |