Spaces:

rahideer
/

cc

Sleeping

App Files Files Community

rahideer committed on Apr 26

Commit

12646dd

verified ·

1 Parent(s): a3f5843

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -61

app.py CHANGED Viewed

@@ -23,7 +23,7 @@ warnings.filterwarnings("ignore")
 MODEL_NAME = "microsoft/codebert-base"
 MAX_LENGTH = 512
 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-DATASET_PATH = "archive (1).zip"  # Update this path if needed
 # Initialize models with caching
 @st.cache_resource
@@ -39,36 +39,30 @@ def load_models():
 @st.cache_resource
 def load_dataset():
     try:
-        # Extract dataset if needed
         if not os.path.exists("Subject_CloneTypes_Directories"):
             with zipfile.ZipFile(DATASET_PATH, 'r') as zip_ref:
                 zip_ref.extractall(".")
-        # Load sample pairs (modify this based on your dataset structure)
         clone_pairs = []
         base_path = "Subject_CloneTypes_Directories"
-        # Example: Load one pair from each clone type
         for clone_type in ["Clone_Type1", "Clone_Type2", "Clone_Type3 - ST"]:
             type_path = os.path.join(base_path, clone_type)
             if os.path.exists(type_path):
                 for root, _, files in os.walk(type_path):
-                    if files:
-                        # Take first two files as a pair
-                        if len(files) >= 2:
-                            with open(os.path.join(root, files[0]), 'r', encoding='utf-8') as f1:
-                                code1 = f1.read()
-                            with open(os.path.join(root, files[1]), 'r', encoding='utf-8') as f2:
-                                code2 = f2.read()
-                            clone_pairs.append({
-                                "type": clone_type,
-                                "code1": code1,
-                                "code2": code2
-                            })
-                        break  # Just take one pair per type for demo
-        return clone_pairs[:10]  # Return first 10 pairs for demo
     except Exception as e:
         st.error(f"Error loading dataset: {str(e)}")
         return []
@@ -76,17 +70,15 @@ def load_dataset():
 tokenizer, code_model = load_models()
 dataset_pairs = load_dataset()
-# Normalization function
 def normalize_code(code):
     try:
-        code = re.sub(r'//.*', '', code)  # Remove single-line comments
-        code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)  # Multi-line comments
-        code = re.sub(r'\s+', ' ', code).strip()  # Normalize whitespace
         return code
     except Exception:
         return code
-# Embedding generation
 def get_embedding(code):
     try:
         code = normalize_code(code)
@@ -101,12 +93,11 @@ def get_embedding(code):
         with torch.no_grad():
             outputs = code_model(**inputs)
-        return outputs.last_hidden_state.mean(dim=1)  # Pooled embedding
     except Exception as e:
         st.error(f"Error processing code: {str(e)}")
         return None
-# Comparison function
 def compare_code(code1, code2):
     if not code1 or not code2:
         return None
@@ -125,9 +116,7 @@ def compare_code(code1, code2):
 # UI Elements
 st.title("🔍 Java Code Clone Detector (IJaDataset 2.1)")
-st.markdown("""
-Compare Java code snippets from the IJaDataset 2.1 using CodeBERT embeddings.
-""")
 # Dataset selector
 selected_pair = None
@@ -154,52 +143,51 @@ with col2:
         value=selected_pair["code2"] if selected_pair else "",
         help="Enter the second Java code snippet"
     )
-# Threshold slider with proper value handling
 threshold = st.slider(
     "Clone Detection Threshold",
     min_value=0.50,
     max_value=1.00,
-    value=0.75,  # Default middle value
     step=0.01,
     help="Similarity score needed to consider code as cloned (0.5-1.0)"
 )
-# In your comparison logic:
-if similarity is not None:
-    # Display results with threshold comparison
-    is_clone = similarity >= threshold
-    st.subheader("Results")
-    col1, col2, col3 = st.columns(3)
-    with col1:
-        st.metric("Similarity Score", f"{similarity:.3f}")
-    with col2:
-        # Show the current threshold being used
-        st.metric("Current Threshold", f"{threshold:.3f}")
-    with col3:
-        # Visual clone decision
-        st.metric(
             "Verdict",
             "✅ CLONE" if is_clone else "❌ NOT CLONE",
             delta=f"{similarity-threshold:+.3f}",
             help=f"Score {'≥' if is_clone else '<'} threshold"
         )
-    # Visual indicator
-    st.progress(similarity)
-    # Interpretation guide
-    with st.expander("Interpretation Guide"):
-        st.markdown("""
-        - **> 0.95**: Nearly identical (Type-1 clone)
-        - **0.85-0.95**: Very similar (Type-2 clone)
-        - **0.70-0.85**: Similar structure (Type-3 clone)
-        - **< 0.70**: Different code
-        """)
-# Footer
 st.markdown("---")
 st.markdown("""
 **Dataset Information**:

 MODEL_NAME = "microsoft/codebert-base"
 MAX_LENGTH = 512
 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+DATASET_PATH = "archive (1).zip"
 # Initialize models with caching
 @st.cache_resource
 @st.cache_resource
 def load_dataset():
     try:
         if not os.path.exists("Subject_CloneTypes_Directories"):
             with zipfile.ZipFile(DATASET_PATH, 'r') as zip_ref:
                 zip_ref.extractall(".")
         clone_pairs = []
         base_path = "Subject_CloneTypes_Directories"
         for clone_type in ["Clone_Type1", "Clone_Type2", "Clone_Type3 - ST"]:
             type_path = os.path.join(base_path, clone_type)
             if os.path.exists(type_path):
                 for root, _, files in os.walk(type_path):
+                    if files and len(files) >= 2:
+                        with open(os.path.join(root, files[0]), 'r', encoding='utf-8') as f1:
+                            code1 = f1.read()
+                        with open(os.path.join(root, files[1]), 'r', encoding='utf-8') as f2:
+                            code2 = f2.read()
+                        clone_pairs.append({
+                            "type": clone_type,
+                            "code1": code1,
+                            "code2": code2
+                        })
+                        break
+        return clone_pairs[:10]
     except Exception as e:
         st.error(f"Error loading dataset: {str(e)}")
         return []
 tokenizer, code_model = load_models()
 dataset_pairs = load_dataset()
 def normalize_code(code):
     try:
+        code = re.sub(r'//.*', '', code)
+        code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
+        code = re.sub(r'\s+', ' ', code).strip()
         return code
     except Exception:
         return code
 def get_embedding(code):
     try:
         code = normalize_code(code)
         with torch.no_grad():
             outputs = code_model(**inputs)
+        return outputs.last_hidden_state.mean(dim=1)
     except Exception as e:
         st.error(f"Error processing code: {str(e)}")
         return None
 def compare_code(code1, code2):
     if not code1 or not code2:
         return None
 # UI Elements
 st.title("🔍 Java Code Clone Detector (IJaDataset 2.1)")
+st.markdown("Compare Java code snippets from the IJaDataset 2.1 using CodeBERT embeddings.")
 # Dataset selector
 selected_pair = None
         value=selected_pair["code2"] if selected_pair else "",
         help="Enter the second Java code snippet"
     )
 threshold = st.slider(
     "Clone Detection Threshold",
     min_value=0.50,
     max_value=1.00,
+    value=0.75,
     step=0.01,
     help="Similarity score needed to consider code as cloned (0.5-1.0)"
 )
+# Only perform comparison when button is clicked
+if st.button("Compare Code"):
+    similarity = compare_code(code1, code2)
+    if similarity is not None:
+        is_clone = similarity >= threshold
+        st.subheader("Results")
+        cols = st.columns(3)
+        cols[0].metric("Similarity Score", f"{similarity:.3f}")
+        cols[1].metric("Current Threshold", f"{threshold:.3f}")
+        cols[2].metric(
             "Verdict",
             "✅ CLONE" if is_clone else "❌ NOT CLONE",
             delta=f"{similarity-threshold:+.3f}",
             help=f"Score {'≥' if is_clone else '<'} threshold"
         )
+        st.progress(similarity)
+        with st.expander("Interpretation Guide"):
+            st.markdown("""
+            - **> 0.95**: Nearly identical (Type-1 clone)
+            - **0.85-0.95**: Very similar (Type-2 clone)
+            - **0.70-0.85**: Similar structure (Type-3 clone)
+            - **< 0.70**: Different code
+            """)
+        with st.expander("Show normalized code"):
+            tab1, tab2 = st.tabs(["First Code", "Second Code"])
+            with tab1:
+                st.code(normalize_code(code1))
+            with tab2:
+                st.code(normalize_code(code2))
 st.markdown("---")
 st.markdown("""
 **Dataset Information**: