import math
import re

import nltk
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer

# Download the WordNet corpus required by the lemmatizer
nltk.download("wordnet")
# nltk.download("omw-1.4")

# Initialize the WordNet lemmatizer
wnl = WordNetLemmatizer()

# Paths to the ten reference documents shipped with the app
files = [f'./assets/text{i}.txt' for i in range(1, 11)]

# Load the stopword list of commonly used words ('is', 'the', 'a', ...)
with open("gist_stopwords.txt", "r") as gist_file:
    stopwords = gist_file.read().split(",")
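# A minimal sketch of the expected file format (hypothetical contents):
# the split(",") above assumes a single comma-separated line such as
#   i,me,my,the,a
# which yields the list ['i', 'me', 'my', 'the', 'a']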

def read_file(name):
    with open(name, 'r') as file:
        return file.read()

def process_string(name):
    text = name.lower()
    # Tokenize with a regex that matches runs of word characters
    # (letters, digits, or underscore), which also strips punctuation
    tokens = re.findall(r'\w+', text)
    # Remove commonly used words like 'is', 'the', 'a', etc.
    filtered_tokens = [token for token in tokens if token not in stopwords]
    # Reduce words to their root form; with pos='n' this lemmatizes
    # nouns, e.g. 'cats' -> 'cat'
    root_tokens = [wnl.lemmatize(token, pos='n') for token in filtered_tokens]
    return root_tokens
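
# Example (exact output depends on gist_stopwords.txt; assuming 'the'
# and 'and' are stopwords):
#   process_string("The cats and the dogs") -> ['cat', 'dog']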

def process_tokens(tokens, st_global_words):
    # Raw counts and term frequencies of every vocabulary word in this
    # document; guard against an empty document to avoid dividing by zero
    freq_dict = {}
    tf_dict = {}
    n = len(tokens)
    for word in st_global_words:
        freq_dict[word] = tokens.count(word)
        tf_dict[word] = freq_dict[word] / n if n else 0.0
    return freq_dict, tf_dict
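
# Worked example: with tokens = ['cat', 'dog', 'cat'] and vocabulary
# {'cat', 'dog', 'fish'}, process_tokens returns
#   freq = {'cat': 2, 'dog': 1, 'fish': 0}
#   tf   = {'cat': 2/3, 'dog': 1/3, 'fish': 0.0}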

def main(input1, input2):
    # Read the reference files and prepend the two user inputs, so rows
    # 0 and 1 of the final matrix correspond to input1 and input2
    processed_files = [read_file(file) for file in files]
    processed_files.insert(0, input2)
    processed_files.insert(0, input1)
    processed_strings = [process_string(file) for file in processed_files]
    # The global vocabulary is the union of every document's tokens
    st_global_words = set()
    for tokens in processed_strings:
        st_global_words.update(tokens)
    processed_tokens = []
    for tokens in processed_strings:
        freq_dict, tf_dict = process_tokens(tokens, st_global_words)
        processed_tokens.append((freq_dict, tf_dict))
    # Inverse document frequency: idf(w) = ln(N / df(w)), where N is the
    # number of documents and df(w) is the number of documents containing
    # w; cnt is never zero because every vocabulary word occurs somewhere
    idf_dict = {}
    for word in st_global_words:
        cnt = 0
        for freq_dict, tf_dict in processed_tokens:
            if freq_dict[word] > 0:
                cnt += 1
        idf_dict[word] = math.log(len(processed_tokens) / cnt)
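    # e.g. with 12 documents (2 inputs + 10 files), a word appearing in 3
    # of them gets idf = ln(12/3) ≈ 1.39; a word in all 12 gets ln(1) = 0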
    # Assemble a DataFrame with the vocabulary, the shared idf column,
    # and per-document frequency, tf, and tf-idf columns; fix the word
    # order once so every column lines up
    word_list = list(st_global_words)
    df = pd.DataFrame({'word': word_list})
    df['idf_col'] = [idf_dict[word] for word in word_list]
    for i, (freq_dict, tf_dict) in enumerate(processed_tokens):
        df[f'freq_{i+1}'] = [freq_dict[word] for word in word_list]
        df[f'tf_{i+1}'] = [tf_dict[word] for word in word_list]
        df[f'tfidf_{i+1}'] = df[f'tf_{i+1}'] * df['idf_col']
    # Stack the per-document tf-idf columns into a single NumPy matrix
    tf_idf_cols = [col for col in df.columns if 'tfidf' in col]
    tf_idf_vals = np.array([df[col].values for col in tf_idf_cols])
    return tf_idf_vals
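
# main() returns one tf-idf row per document: with 2 inputs and 10 asset
# files that is a (12, vocab_size) array; rows 0 and 1 are the inputs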

def cosine_diff(A, B):
    # Despite the name, this computes cosine *similarity*:
    # dot(A, B) / (|A| * |B|)
    dot_product = sum(a * b for a, b in zip(A, B))
    norm_A = math.sqrt(sum(a ** 2 for a in A))
    norm_B = math.sqrt(sum(b ** 2 for b in B))
    # Guard against zero vectors (e.g. an input made only of stopwords),
    # which would otherwise raise ZeroDivisionError
    if norm_A == 0 or norm_B == 0:
        return 0.0
    return dot_product / (norm_A * norm_B)
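
# Sanity check with hypothetical vectors: parallel vectors score 1.0,
# orthogonal ones 0.0
#   cosine_diff([1, 0], [1, 0]) -> 1.0
#   cosine_diff([1, 0], [0, 1]) -> 0.0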

def euclidean(A, B):
    # Euclidean distance: square root of the summed squared differences
    su = 0
    for a, b in zip(A, B):
        su += (a - b) ** 2
    return math.sqrt(su)
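
# Sanity check (3-4-5 triangle): euclidean([0, 0], [3, 4]) -> 5.0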

def final_main(input1, input2):
    tf_idf_vals = main(input1, input2)
    outputString = ""
    # Compare the tf-idf vectors of the two user inputs (rows 0 and 1)
    similarity = cosine_diff(tf_idf_vals[0], tf_idf_vals[1])
    outputString += f"Cosine similarity: {round(similarity * 100, 2)}%\n"
    # euclidean() already takes the square root, so don't apply it twice
    diff = euclidean(tf_idf_vals[0], tf_idf_vals[1])
    outputString += f"Euclidean Distance (difference): {round(diff * 100, 2)}%\n"
    return outputString
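
# Usage sketch (hypothetical inputs; requires the ./assets files, the
# stopword list, and the WordNet download to be in place):
#   print(final_main("The cats chase the mouse", "A cat chased a mouse"))
# prints a cosine-similarity percentage and a scaled Euclidean distance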