import math
import re

import nltk
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer

# Download the WordNet corpus required by the lemmatizer
nltk.download("wordnet")
# nltk.download("omw-1.4")

# Initialize the WordNet lemmatizer
wnl = WordNetLemmatizer()

# Paths to the ten reference documents shipped with the app
files = [f'./assets/text{i}.txt' for i in range(1, 11)]

# Load the stopword list of commonly used words ('is', 'the', 'a', ...)
with open("gist_stopwords.txt", "r") as gist_file:
    stopwords = gist_file.read().split(",")
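# A minimal sketch of the expected file format (hypothetical contents):
# the split(",") above assumes a single comma-separated line such as
#   i,me,my,the,a
# which yields the list ['i', 'me', 'my', 'the', 'a']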

def read_file(name):
    with open(name, 'r') as file:
        return file.read()

def process_string(name):
    text = name.lower()
    # Tokenize with a regex that matches runs of word characters
    # (letters, digits, or underscore), which also strips punctuation
    tokens = re.findall(r'\w+', text)
    # Remove commonly used words like 'is', 'the', 'a', etc.
    filtered_tokens = [token for token in tokens if token not in stopwords]
    # Reduce words to their root form; with pos='n' this lemmatizes
    # nouns, e.g. 'cats' -> 'cat'
    root_tokens = [wnl.lemmatize(token, pos='n') for token in filtered_tokens]
    return root_tokens
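
# Example (exact output depends on gist_stopwords.txt; assuming 'the'
# and 'and' are stopwords):
#   process_string("The cats and the dogs") -> ['cat', 'dog']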

def process_tokens(tokens, st_global_words):
    # Raw counts and term frequencies of every vocabulary word in this
    # document; guard against an empty document to avoid dividing by zero
    freq_dict = {}
    tf_dict = {}
    n = len(tokens)
    for word in st_global_words:
        freq_dict[word] = tokens.count(word)
        tf_dict[word] = freq_dict[word] / n if n else 0.0
    return freq_dict, tf_dict
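
# Worked example: with tokens = ['cat', 'dog', 'cat'] and vocabulary
# {'cat', 'dog', 'fish'}, process_tokens returns
#   freq = {'cat': 2, 'dog': 1, 'fish': 0}
#   tf   = {'cat': 2/3, 'dog': 1/3, 'fish': 0.0}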

def main(input1, input2):
    # Read the reference files and prepend the two user inputs, so rows
    # 0 and 1 of the final matrix correspond to input1 and input2
    processed_files = [read_file(file) for file in files]
    processed_files.insert(0, input2)
    processed_files.insert(0, input1)
    processed_strings = [process_string(file) for file in processed_files]
    # The global vocabulary is the union of every document's tokens
    st_global_words = set()
    for tokens in processed_strings:
        st_global_words.update(tokens)
    processed_tokens = []
    for tokens in processed_strings:
        freq_dict, tf_dict = process_tokens(tokens, st_global_words)
        processed_tokens.append((freq_dict, tf_dict))
    # Inverse document frequency: idf(w) = ln(N / df(w)), where N is the
    # number of documents and df(w) is the number of documents containing
    # w; cnt is never zero because every vocabulary word occurs somewhere
    idf_dict = {}
    for word in st_global_words:
        cnt = 0
        for freq_dict, tf_dict in processed_tokens:
            if freq_dict[word] > 0:
                cnt += 1
        idf_dict[word] = math.log(len(processed_tokens) / cnt)
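    # e.g. with 12 documents (2 inputs + 10 files), a word appearing in 3
    # of them gets idf = ln(12/3) ≈ 1.39; a word in all 12 gets ln(1) = 0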
    # Assemble a DataFrame with the vocabulary, the shared idf column,
    # and per-document frequency, tf, and tf-idf columns; fix the word
    # order once so every column lines up
    word_list = list(st_global_words)
    df = pd.DataFrame({'word': word_list})
    df['idf_col'] = [idf_dict[word] for word in word_list]
    for i, (freq_dict, tf_dict) in enumerate(processed_tokens):
        df[f'freq_{i+1}'] = [freq_dict[word] for word in word_list]
        df[f'tf_{i+1}'] = [tf_dict[word] for word in word_list]
        df[f'tfidf_{i+1}'] = df[f'tf_{i+1}'] * df['idf_col']
    # Stack the per-document tf-idf columns into a single NumPy matrix
    tf_idf_cols = [col for col in df.columns if 'tfidf' in col]
    tf_idf_vals = np.array([df[col].values for col in tf_idf_cols])
    return tf_idf_vals
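
# main() returns one tf-idf row per document: with 2 inputs and 10 asset
# files that is a (12, vocab_size) array; rows 0 and 1 are the inputs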

def cosine_diff(A, B):
    # Despite the name, this computes cosine *similarity*:
    # dot(A, B) / (|A| * |B|)
    dot_product = sum(a * b for a, b in zip(A, B))
    norm_A = math.sqrt(sum(a ** 2 for a in A))
    norm_B = math.sqrt(sum(b ** 2 for b in B))
    # Guard against zero vectors (e.g. an input made only of stopwords),
    # which would otherwise raise ZeroDivisionError
    if norm_A == 0 or norm_B == 0:
        return 0.0
    return dot_product / (norm_A * norm_B)
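
# Sanity check with hypothetical vectors: parallel vectors score 1.0,
# orthogonal ones 0.0
#   cosine_diff([1, 0], [1, 0]) -> 1.0
#   cosine_diff([1, 0], [0, 1]) -> 0.0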

def euclidean(A, B):
    # Euclidean distance: square root of the summed squared differences
    su = 0
    for a, b in zip(A, B):
        su += (a - b) ** 2
    return math.sqrt(su)
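
# Sanity check (3-4-5 triangle): euclidean([0, 0], [3, 4]) -> 5.0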

def final_main(input1, input2):
    tf_idf_vals = main(input1, input2)
    outputString = ""
    # Compare the tf-idf vectors of the two user inputs (rows 0 and 1)
    similarity = cosine_diff(tf_idf_vals[0], tf_idf_vals[1])
    outputString += f"Cosine similarity: {round(similarity * 100, 2)}%\n"
    # euclidean() already takes the square root, so don't apply it twice
    diff = euclidean(tf_idf_vals[0], tf_idf_vals[1])
    outputString += f"Euclidean Distance (difference): {round(diff * 100, 2)}%\n"
    return outputString
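
# Usage sketch (hypothetical inputs; requires the ./assets files, the
# stopword list, and the WordNet download to be in place):
#   print(final_main("The cats chase the mouse", "A cat chased a mouse"))
# prints a cosine-similarity percentage and a scaled Euclidean distance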