"""Annotate medical report texts with entities extracted by an Azure OpenAI model.

Every ``.txt`` file of an input folder is sent to the configured chat
deployment; the returned entities are appended, together with the source
text, to an output file with one JSON record per line.
"""

import json
import logging
import os
from typing import Collection

from dotenv import load_dotenv
from openai import AzureOpenAI

load_dotenv()

logger = logging.getLogger(__name__)

AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")  # deployment name
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-12-01-preview")

# Configure OpenAI for Azure
client = AzureOpenAI(
    api_key=AZURE_OPENAI_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)

# Label fields every annotation record is expected to carry.
ENTITY_FIELDS = (
    "exam_types",
    "specialties",
    "anatomical_regions",
    "pathologies",
    "procedures",
    "measurements",
    "medications",
    "symptoms",
)

# Prompt sent to the model; ``{text}`` is replaced by the report to analyze.
PROMPT_TEMPLATE = """
You are a medical NER expert. Your task is to extract relevant entities from the given medical report text and return them in a JSON object.

Analyze the text carefully and identify the following fields:

- "exam_types": any type of medical test, examination, or diagnostic method performed on the patient.
- "specialties": the branch of medicine or medical discipline relevant to the report.
- "anatomical_regions": specific parts or regions of the body mentioned in the report.
- "pathologies": diagnosed diseases, disorders, or abnormal medical conditions noted in the report.
- "procedures": medical interventions, treatments, or actions performed on the patient.
- "measurements": numerical values or quantities recorded in the report, such as vital signs, lab results, sizes, or pressures.
- "medications": drugs, therapies, or prescribed substances mentioned in the report.
- "symptoms": patient-experienced signs or observable indications of a health issue.

Text to analyze:
\"\"\"
{text}
\"\"\"

Return ONLY a valid JSON object with all fields. If a field has no values, return an empty list.
"""

# File names listed by the previously disabled variant of the entry point.
# They are NOT skipped by default; call ``main(excluded_files=EXCLUDED_FILES)``
# to skip them.
# NOTE(review): the reason these files were excluded is not recorded in this
# file -- confirm before relying on the list.
EXCLUDED_FILES = frozenset({
    "template7.txt", "template1167.txt", "template429.txt", "template401.txt", "template367.txt",
    "template415.txt", "template398.txt", "template1198.txt", "template159.txt", "template165.txt",
    "template1107.txt", "template449.txt", "template1113.txt", "template313.txt", "template475.txt",
    "template461.txt", "template307.txt", "template893.txt", "template139.txt", "template887.txt",
    "template677.txt", "template111.txt", "template105.txt", "template663.txt", "template688.txt",
    "template850.txt", "template844.txt", "template878.txt", "template16.txt", "template703.txt",
    "template717.txt", "template924.txt", "template930.txt", "template918.txt", "template1073.txt",
    "template529.txt", "template1067.txt", "template267.txt", "template501.txt", "template515.txt",
    "template273.txt", "template298.txt", "template1098.txt", "template1099.txt", "template299.txt",
    "template514.txt", "template272.txt", "template266.txt", "template500.txt", "template528.txt",
    "template1066.txt", "template1072.txt", "template919.txt", "template931.txt", "template925.txt",
    "template716.txt", "template702.txt", "template879.txt", "template845.txt", "template851.txt",
    "template689.txt", "template104.txt", "template662.txt", "template676.txt", "template110.txt",
    "template138.txt", "template886.txt", "template892.txt", "template460.txt", "template306.txt",
    "template312.txt", "template474.txt", "template1112.txt", "template1106.txt", "template448.txt",
    "template338.txt", "template1110.txt", "template1104.txt", "template304.txt", "template462.txt",
    "template476.txt", "template310.txt", "template1138.txt", "template489.txt", "template884.txt",
    "template890.txt", "template648.txt", "template660.txt", "template106.txt", "template112.txt",
    "template674.txt", "template847.txt", "template853.txt", "template728.txt", "template15.txt",
    "template714.txt", "template29.txt", "template700.txt", "template933.txt", "template927.txt",
    "template1064.txt", "template1070.txt", "template258.txt", "template1058.txt", "template270.txt",
    "template516.txt", "template502.txt", "template264.txt", "template503.txt", "template265.txt",
    "template271.txt", "template1059.txt", "template517.txt", "template259.txt", "template1071.txt",
    "template1065.txt", "template926.txt", "template932.txt", "template701.txt", "template715.txt",
    "template28.txt", "template729.txt", "template14.txt", "template852.txt", "template846.txt",
    "template113.txt", "template675.txt", "template661.txt", "template107.txt", "template649.txt",
    "template891.txt", "template885.txt", "template488.txt", "template477.txt", "template1139.txt",
    "template311.txt", "template305.txt", "template463.txt", "template1105.txt", "template1111.txt",
    "template339.txt", "template467.txt", "template1129.txt", "template301.txt", "template315.txt",
    "template473.txt", "template1115.txt", "template1101.txt", "template329.txt", "template498.txt",
    "template103.txt", "template665.txt", "template671.txt", "template117.txt", "template881.txt",
    "template659.txt", "template895.txt", "template842.txt", "template856.txt", "template711.txt",
    "template705.txt", "template38.txt", "template10.txt", "template739.txt", "template936.txt",
    "template922.txt", "template513.txt", "template275.txt", "template261.txt", "template1049.txt",
    "template507.txt", "template249.txt", "template1061.txt", "template1075.txt", "template1074.txt",
    "template1060.txt", "template248.txt", "template1048.txt", "template260.txt", "template506.txt",
    "template512.txt", "template274.txt", "template923.txt", "template937.txt", "template738.txt",
    "template11.txt", "template704.txt", "template710.txt", "template857.txt", "template843.txt",
    "template894.txt", "template658.txt", "template880.txt", "template670.txt", "template116.txt",
    "template102.txt", "template664.txt", "template499.txt", "template328.txt", "template1100.txt",
    "template1114.txt", "template314.txt", "template472.txt", "template466.txt", "template300.txt",
    "template1128.txt", "template470.txt", "template316.txt", "template302.txt", "template464.txt",
    "template1102.txt", "template1116.txt", "template458.txt", "template114.txt", "template672.txt",
    "template666.txt", "template100.txt", "template128.txt", "template896.txt", "template882.txt",
    "template869.txt", "template855.txt", "template699.txt", "template841.txt", "template706.txt",
    "template712.txt", "template13.txt", "template909.txt", "template921.txt", "template935.txt",
    "template504.txt", "template262.txt", "template276.txt", "template510.txt", "template538.txt",
    "template1076.txt", "template1062.txt", "template1089.txt", "template289.txt", "template288.txt",
    "template1088.txt", "template1063.txt", "template539.txt", "template1077.txt", "template277.txt",
    "template511.txt", "template505.txt", "template263.txt", "template934.txt", "template920.txt",
    "template908.txt", "template12.txt", "template713.txt", "template707.txt", "template840.txt",
    "template698.txt", "template854.txt", "template868.txt", "template883.txt", "template129.txt",
    "template897.txt", "template667.txt", "template101.txt", "template115.txt", "template673.txt",
    "template1117.txt", "template459.txt", "template1103.txt", "template303.txt", "template465.txt",
    "template471.txt", "template317.txt", "template4.txt", "template1164.txt", "template1170.txt",
    "template358.txt", "template416.txt", "template1158.txt", "template370.txt", "template364.txt",
    "template402.txt", "template628.txt", "template172.txt", "template614.txt", "template600.txt",
    "template166.txt", "template833.txt", "template827.txt", "template199.txt", "template61.txt",
    "template1212.txt", "template984.txt", "template748.txt", "template990.txt", "template75.txt",
    "template1206.txt", "template760.txt", "template774.txt", "template49.txt", "template947.txt",
    "template953.txt", "template238.txt", "template1010.txt", "template1004.txt", "template562.txt",
    "template204.txt", "template210.txt", "template1038.txt", "template576.txt", "template589.txt",
    "template588.txt", "template1039.txt", "template211.txt", "template577.txt", "template563.txt",
    "template205.txt", "template1005.txt", "template1011.txt", "template239.txt", "template952.txt",
    "template946.txt", "template775.txt", "template48.txt", "template761.txt", "template991.txt",
    "template749.txt", "template1207.txt", "template74.txt", "template1213.txt", "template60.txt",
    "template985.txt", "template826.txt", "template198.txt", "template832.txt", "template601.txt",
    "template167.txt", "template173.txt", "template615.txt", "template629.txt", "template365.txt",
    "template403.txt", "template417.txt", "template371.txt", "template1159.txt", "template359.txt",
    "template1171.txt", "template1165.txt", "template5.txt", "template1173.txt", "template373.txt",
})


def _empty_entities() -> dict:
    """Return a fresh label dict with every expected field set to an empty list."""
    return {field: [] for field in ENTITY_FIELDS}


def _parse_entities(content) -> dict:
    """Turn the raw model reply into a label dict that has every expected field.

    Args:
        content: The reply text, or ``None`` when the model returned no text.

    Returns:
        The parsed JSON object with any missing field of ``ENTITY_FIELDS``
        added as an empty list, or an all-empty label dict when the reply is
        empty, is not valid JSON, or is valid JSON but not an object.
    """
    if not content:
        # json.loads(None) raises TypeError, which would abort the whole run.
        logger.warning("Model returned no content; using empty labels.")
        return _empty_entities()

    payload = content.strip()
    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError:
        # The reply may wrap the object in a Markdown fence or extra prose;
        # retry on the outermost {...} span before giving up.
        start, end = payload.find("{"), payload.rfind("}")
        if start == -1 or end <= start:
            logger.warning("Model reply contains no JSON object; using empty labels.")
            return _empty_entities()
        try:
            parsed = json.loads(payload[start:end + 1])
        except json.JSONDecodeError:
            logger.warning("Model reply is not valid JSON; using empty labels.")
            return _empty_entities()

    if not isinstance(parsed, dict):
        logger.warning("Model reply is JSON but not an object; using empty labels.")
        return _empty_entities()

    # Keep every record on the same schema even if the model dropped a field.
    for field in ENTITY_FIELDS:
        parsed.setdefault(field, [])
    return parsed


def extract_medical_entities(text: str) -> dict:
    """Ask the configured Azure OpenAI deployment for the entities found in *text*.

    Args:
        text: The medical report text to analyze.

    Returns:
        A dict containing at least the keys of ``ENTITY_FIELDS``. When the
        reply cannot be used, every field is an empty list and a warning is
        logged.

    Errors raised by the API call itself are not caught here.
    """
    prompt = PROMPT_TEMPLATE.format(text=text)
    # temperature and max_tokens are deliberately not passed (they were
    # commented out in the original call), so the API defaults apply.
    response = client.chat.completions.create(
        model=AZURE_OPENAI_DEPLOYMENT,
        messages=[{"role": "user", "content": prompt}],
    )
    return _parse_entities(response.choices[0].message.content)


def save_annotation(text: str, labels: dict, output_file="dataset.jsonl"):
    """Append one ``{"text": ..., "labels": ...}`` record to *output_file*.

    Args:
        text: The source text that was annotated.
        labels: The labels extracted for *text*.
        output_file: Path of the file to append to; one JSON document per line.
    """
    record = {"text": text, "labels": labels}
    # Append as one line of JSON; ensure_ascii=False keeps non-ASCII text readable.
    with open(output_file, "a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")


def main(
    input_folder: str = "data_txt",
    output_file: str = "dataset.json",
    excluded_files: Collection[str] = frozenset(),
) -> None:
    """Annotate every ``.txt`` file of *input_folder* and write the records to *output_file*.

    Args:
        input_folder: Folder containing the ``.txt`` files to process.
        output_file: Destination file. It is emptied first and then receives
            one JSON record per line (JSON Lines, despite the default
            ``.json`` extension, which is kept from the original script).
        excluded_files: File names to skip. Empty by default, so every
            ``.txt`` file is processed.
    """
    # Start from an empty file: save_annotation() appends, so records from a
    # previous run would otherwise be kept.
    with open(output_file, "w", encoding="utf-8"):
        pass

    processed_count = 0
    excluded_count = 0

    # Sorted so the record order is the same on every run and platform.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(".txt"):
            continue

        if filename in excluded_files:
            print(f"⏭️ Fichier exclu : {filename}")
            excluded_count += 1
            continue

        file_path = os.path.join(input_folder, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            transcription = f.read().strip()

        print(f"\n=== Processing {filename} ===")
        entities = extract_medical_entities(transcription)

        # Save results
        save_annotation(transcription, entities, output_file=output_file)
        print(f"✅ Saved {filename} → {output_file}")
        processed_count += 1

    print(f"\n📊 Résumé : {processed_count} fichiers traités, {excluded_count} fichiers exclus")


if __name__ == "__main__":
    main()