Spaces:
Runtime error
Runtime error
| import os | |
| import requests | |
| import tempfile | |
| from datetime import datetime, timezone | |
| import base64 | |
| from tqdm.auto import tqdm | |
| import pymupdf | |
| DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers" | |
class PaperManager:
    """Fetch Hugging Face daily papers, rank them by a "rising" score,
    and extract full text from the corresponding arXiv PDFs.
    """

    def __init__(self, papers_per_page=30):
        # Previously accepted but ignored; stored now for forward use while
        # keeping the constructor signature backward-compatible.
        self.papers_per_page = papers_per_page
        self.papers = []      # Ranked/filtered subset of raw_papers.
        self.raw_papers = []  # Unfiltered data as fetched from the API.

    def calculate_rising_score(self, paper):
        """
        Calculate the rising score of a paper.

        This emphasizes recent upvotes and the rate of upvote accumulation:
        score = upvotes / (hours_since_publication + 1), a linear decay so
        fresh papers with few upvotes can outrank older, higher-voted ones.
        """
        upvotes = paper.get('paper', {}).get('upvotes', 0)
        published_at_str = paper.get('publishedAt', datetime.now(timezone.utc).isoformat())
        try:
            # API timestamps may carry a trailing 'Z'; normalize to an explicit
            # UTC offset so datetime.fromisoformat accepts it on older Pythons.
            published_time = datetime.fromisoformat(published_at_str.replace('Z', '+00:00'))
        except ValueError:
            # Unparseable timestamp: treat the paper as just published.
            published_time = datetime.now(timezone.utc)

        time_diff_hours = (datetime.now(timezone.utc) - published_time).total_seconds() / 3600
        return upvotes / (time_diff_hours + 1)

    def fetch_papers(self):
        """Fetch up to 100 daily papers into self.raw_papers.

        Returns:
            bool: True on success, False on any error (message printed).
        """
        try:
            # Explicit timeout so a stalled connection cannot hang forever.
            response = requests.get(f"{DAILY_PAPERS_API_URL}?limit=100", timeout=30)
            response.raise_for_status()
            data = response.json()

            if not data:
                print("No data received from API.")
                return False

            self.raw_papers = data  # Store raw data
            return True

        except requests.RequestException as e:
            print(f"Error fetching papers: {e}")
            return False
        except Exception as e:
            print(f"Unexpected error: {e}")
            return False

    def filter_top_papers(self, threshold_general=2.0, threshold_agent=0.7, limit=2):
        """Rank raw_papers by rising score and keep the top `limit` entries.

        Papers whose title contains 'agent' get a 3x score boost. The
        threshold parameters are currently unused (score-threshold filtering
        is intentionally disabled) but kept for interface compatibility.

        Returns:
            list: The selected papers, highest score first (also stored in self.papers).
        """
        self.papers = sorted(
            self.raw_papers,
            key=lambda p: self.calculate_rising_score(p) * (3 if 'agent' in p['title'].lower() else 1),
            reverse=True,
        )[:limit]
        return self.papers

    def get_paper_text(self, paper_id):
        """Download the arXiv PDF for `paper_id` and return its extracted text.

        Raises:
            Exception: If the PDF download does not return HTTP 200.
        """
        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        response = requests.get(url, timeout=60)
        if response.status_code != 200:
            raise Exception(f"Failed to download PDF: {response.status_code}")

        # Open straight from memory — avoids leaving a stray temp.pdf in the CWD.
        with pymupdf.open(stream=response.content, filetype="pdf") as doc:
            return "".join(page.get_text() for page in doc)

    def get_top_content(self):
        """Fetch, rank, and return {paper title: full text} for the top papers."""
        self.fetch_papers()
        self.filter_top_papers()
        contents = {}
        print(f"Processing {len(self.papers)} papers:")
        for paper in tqdm(self.papers):
            paper_id = paper["paper"]['id']
            contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
        return contents