prshanthreddy committed
Commit d256084 · verified · 1 Parent(s): cf40291

Upload 8 files

README.md CHANGED
@@ -1,8 +1,8 @@
  ---
  title: Webphishing
- emoji: 😻
+ emoji: 🐨
  colorFrom: green
- colorTo: indigo
+ colorTo: pink
  sdk: gradio
  sdk_version: 5.29.1
  app_file: app.py
app.py ADDED
@@ -0,0 +1,202 @@
+ import gradio as gr
+ from urllib.parse import urlparse
+ import re
+ import numpy as np
+ import requests
+ from bs4 import BeautifulSoup
+ import whois
+ from datetime import datetime
+ import pandas as pd
+ import pickle
+ import os
+ from dotenv import load_dotenv
+
+ # Load environment variables from .env
+ load_dotenv()
+
+ MOZ_ACCESS_ID = os.getenv("MOZ_ACCESS_ID")
+ MOZ_SECRET_KEY = os.getenv("MOZ_SECRET_KEY")
+ SERPAPI_KEY = os.getenv("SERPAPI_KEY")
+
+ # Load the trained model pipelines
+ with open("phishing_svm_model.pkl", "rb") as f:
+     svm_pipeline = pickle.load(f)
+ with open("phishing_knn_model.pkl", "rb") as f:
+     knn_pipeline = pickle.load(f)
+ with open("phishing_rf_model.pkl", "rb") as f:
+     rf_pipeline = pickle.load(f)
+
+ # Map features to their source
+ feature_sources = {
+     # Auto-extracted features
+     'length_url': 'Calculated from URL', 'length_hostname': 'Calculated from URL',
+     'ip': 'Calculated from URL', 'nb_dots': 'Calculated from URL',
+     'nb_qm': 'Calculated from URL', 'nb_eq': 'Calculated from URL',
+     'nb_slash': 'Calculated from URL', 'nb_www': 'Calculated from URL',
+     'ratio_digits_url': 'Calculated from URL', 'ratio_digits_host': 'Calculated from URL',
+     'tld_in_subdomain': 'Calculated from URL', 'prefix_suffix': 'Calculated from URL',
+     'shortest_word_host': 'Calculated from URL', 'longest_words_raw': 'Calculated from URL',
+     'longest_word_path': 'Calculated from URL', 'phish_hints': 'Calculated from URL',
+     # API-extracted features
+     'domain_age': 'API', 'google_index': 'API', 'page_rank': 'API',
+     'empty_title': 'API', 'domain_in_title': 'API'
+ }
+
+ all_features = [
+     'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_qm', 'nb_eq',
+     'nb_slash', 'nb_www', 'ratio_digits_url', 'ratio_digits_host',
+     'tld_in_subdomain', 'prefix_suffix', 'shortest_word_host',
+     'longest_words_raw', 'longest_word_path', 'phish_hints',
+     'nb_hyperlinks', 'ratio_intHyperlinks', 'empty_title',
+     'domain_in_title', 'domain_age', 'google_index', 'page_rank'
+ ]
+
+ auto_features = [
+     'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_qm', 'nb_eq',
+     'nb_slash', 'nb_www', 'ratio_digits_url', 'ratio_digits_host',
+     'tld_in_subdomain', 'prefix_suffix', 'shortest_word_host',
+     'longest_words_raw', 'longest_word_path', 'phish_hints'
+ ]
+
+ manual_features = sorted(set(all_features) - set(auto_features))
+
+ def extract_from_url(url):
+     parsed = urlparse(url)
+     hostname = parsed.hostname or ""
+     path = parsed.path or ""
+
+     features = {
+         'length_url': len(url),
+         'length_hostname': len(hostname),
+         'ip': 1 if re.fullmatch(r"(\d{1,3}\.){3}\d{1,3}", hostname) else 0,
+         'nb_dots': url.count('.'),
+         'nb_qm': url.count('?'),
+         'nb_eq': url.count('='),
+         'nb_slash': url.count('/'),
+         'nb_www': url.count('www'),
+         'ratio_digits_url': sum(c.isdigit() for c in url) / len(url) if len(url) > 0 else 0,
+         'ratio_digits_host': sum(c.isdigit() for c in hostname) / len(hostname) if len(hostname) > 0 else 0,
+         'tld_in_subdomain': int(any(tld in hostname.split('.')[:-1] for tld in ['com', 'net', 'org'])),
+         'prefix_suffix': int('-' in hostname),
+         'shortest_word_host': min((len(w) for w in hostname.split('.')), default=0),
+         'longest_words_raw': max((len(w) for w in url.split('/')), default=0),
+         'longest_word_path': max((len(w) for w in path.split('/')), default=0),
+         'phish_hints': int(any(k in url.lower() for k in ['secure', 'login', 'paypal', 'ebay', 'bank']))
+     }
+     return features
+
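For a quick sanity check, extract_from_url can be called directly; the URL below is purely illustrative, not from the commit:

    feats = extract_from_url("http://login-secure.example.com/account/update?id=123")
    print(feats['phish_hints'])    # 1 -- "secure" and "login" appear in the URL
    print(feats['prefix_suffix'])  # 1 -- the hostname contains a hyphen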
+ def get_domain_age(domain):
+     try:
+         info = whois.whois(domain)
+         creation = info.creation_date
+         # python-whois can return a list of creation dates; take the first
+         if isinstance(creation, list):
+             creation = creation[0]
+         age = (datetime.now() - creation).days if creation else 0
+         return age
+     except Exception:
+         return 0
+
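The list handling matters because python-whois sometimes returns several creation dates for one domain. A fixed-date illustration of the age arithmetic (dates made up):

    from datetime import datetime
    creation = [datetime(2015, 3, 1), datetime(2018, 3, 1)][0]  # list case: take the first
    print((datetime(2025, 3, 1) - creation).days)  # 3653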
+ def get_title_features(url):
+     try:
+         res = requests.get(url, timeout=5)
+         soup = BeautifulSoup(res.content, "html.parser")
+         # soup.title.string can be None (e.g. an empty <title>), so fall back to ""
+         title = (soup.title.string or "") if soup.title else ""
+         hostname = urlparse(url).hostname or ""
+         return {
+             "empty_title": int(title.strip() == ""),
+             "domain_in_title": int(hostname.lower().split('.')[0] in title.lower()) if title else 0
+         }
+     except Exception:
+         return {"empty_title": 1, "domain_in_title": 0}
+
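The title logic can be exercised offline by parsing a literal HTML string, without hitting the network (a sketch, not part of app.py):

    from bs4 import BeautifulSoup
    soup = BeautifulSoup("<head><title>Example Domain</title></head>", "html.parser")
    title = (soup.title.string or "") if soup.title else ""
    print(int(title.strip() == ""))         # 0 -- the page has a non-empty title
    print(int("example" in title.lower()))  # 1 -- the domain word appears in the title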
+ def get_page_rank(url):
+     # Requires Moz API credentials (MOZ_ACCESS_ID / MOZ_SECRET_KEY in .env)
+     endpoint = "https://lsapi.seomoz.com/v2/url_metrics"
+     headers = {"Content-Type": "application/json"}
+     response = requests.post(
+         endpoint,
+         json={"targets": [url]},
+         auth=(MOZ_ACCESS_ID, MOZ_SECRET_KEY),
+         headers=headers,
+         timeout=10
+     )
+     return response.json()["results"][0]["page_authority"]
+     # return 0  # Placeholder for demo without credentials
+
+ def is_google_indexed(url):
+     # Requires a SerpAPI key (SERPAPI_KEY in .env)
+     search_url = f"https://serpapi.com/search?engine=google&q=site:{url}&api_key={SERPAPI_KEY}"
+     res = requests.get(search_url, timeout=10).json()
+     return 1 if res.get("organic_results") else 0
+     # return 0  # Placeholder for demo without an API key
+
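Both helpers fail hard when the credentials are missing; the commented-out "return 0" lines suggest a demo fallback was intended. One way to wire that up (an assumption about intent; safe_page_rank is a name introduced here, not in the commit):

    def safe_page_rank(url):
        # Fall back to 0, as the placeholder comment hints, when the Moz
        # credentials are absent or the request fails.
        if not (MOZ_ACCESS_ID and MOZ_SECRET_KEY):
            return 0
        try:
            return get_page_rank(url)
        except Exception:
            return 0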
+ def predict_from_url(url, model_choice, *manual_inputs):
+     auto_vals = extract_from_url(url)
+     hostname = urlparse(url).hostname or ""
+
+     # API features
+     auto_vals['domain_age'] = get_domain_age(hostname)
+     auto_vals['page_rank'] = get_page_rank(url)
+     auto_vals['google_index'] = is_google_indexed(url)
+     title_feats = get_title_features(url)
+     auto_vals.update(title_feats)
+
+     manual_features_remaining = [f for f in manual_features if f not in auto_vals]
+     manual_vals = dict(zip(manual_features_remaining, manual_inputs))
+
+     # Build the input vector in the order the models were trained on
+     full_input = []
+     feature_rows = []
+     for f in all_features:
+         if f in auto_vals:
+             val = auto_vals[f]
+             source = feature_sources.get(f, "Auto")
+         elif f in manual_vals:
+             val = manual_vals[f]
+             source = "Manual"
+         else:
+             val = 0  # default keeps the feature vector numeric
+             source = "Manual"
+         full_input.append(val)
+         feature_rows.append({"Feature": f, "Value": val, "Source": source})
+
+     X = np.array(full_input).reshape(1, -1)
+     # Model selection
+     if model_choice == "SVM":
+         prediction = svm_pipeline.predict(X)[0]
+     elif model_choice == "Random Forest":
+         prediction = rf_pipeline.predict(X)[0]
+     else:  # KNN
+         prediction = knn_pipeline.predict(X)[0]
+
+     result_str = "Phishing 🚨 (1)" if prediction == 1 else "Legitimate ✅ (0)"
+     df = pd.DataFrame(feature_rows)
+     return result_str, df
+
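The pipelines expect the 23 features of all_features in that exact order, so a cheap shape assertion can catch wiring mistakes early (a sketch):

    X = np.array([0] * len(all_features), dtype=float).reshape(1, -1)
    assert X.shape == (1, 23)  # one row, 23 features in all_features order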
+ # Manual features needed for input
+ manual_inputs = [gr.Number(label=f"{f} (manual)") for f in manual_features if f not in [
+     'domain_age', 'page_rank', 'google_index', 'empty_title', 'domain_in_title'
+ ]]
+
+ app = gr.Interface(
+     fn=predict_from_url,
+     inputs=[
+         gr.Text(label="Enter URL"),
+         gr.Dropdown(choices=["SVM", "KNN", "Random Forest"], label="Choose Model", value="KNN"),
+         *manual_inputs
+     ],
+     outputs=[
+         gr.Text(label="Prediction"),
+         gr.Dataframe(label="Calculated Features Table")
+     ],
+     title="🔍 Advanced URL Phishing Detector",
+     description="See all extracted and provided features, their values, and their source (Auto, API, Manual)."
+ )
+
+ app.launch(share=True, debug=True)
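app.py reads three credentials through python-dotenv; a minimal pre-launch check (a sketch, not part of the commit) surfaces missing keys before any API call fails:

    import os
    from dotenv import load_dotenv

    load_dotenv()
    for key in ("MOZ_ACCESS_ID", "MOZ_SECRET_KEY", "SERPAPI_KEY"):
        if not os.getenv(key):
            print(f"Warning: {key} is not set; the corresponding lookup will fail.")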
phishing_knn_model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:84bece316404cc3935010ee07ce9aaa706cc48ab4473635abdf758146c5b2ecd
+ size 132
phishing_model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6638b41ae2098992ec578a0b01d2b4e3c299b68e8d58c544c85d447b1c7942a9
+ size 132
phishing_rf_model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba881e19a2f8be43ee6eeb458b594bcec5387444af1aab2a48dbefea0b91abb1
+ size 132
phishing_svm_model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:be4ceae37d96deea0192dbea47fb1c6abde7817ff2ac5a4114585ef5472f893e
+ size 131
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ joblib>=1.2
+ gradio>=4.0
+ scikit-learn>=1.2
+ pandas>=1.4
+ numpy>=1.21
+ requests>=2.28
+ beautifulsoup4>=4.11
+ python-whois>=0.8
+ python-dotenv
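To reproduce the Space locally, run pip install -r requirements.txt and then python app.py; app.py starts its own Gradio server via app.launch.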
scaler.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:08a6f34d80224c50be00fd6dd9675361decf10508e64e9408f6fd89bd62f1c66
+ size 129