From 1831ce5c1706af8c8b6258c4acc0db48ff9506a3 Mon Sep 17 00:00:00 2001 From: Rodrigo Gonzalez Laiz Date: Tue, 16 Sep 2025 17:43:11 +0200 Subject: [PATCH 1/5] WIP basecamp paper formatter --- .../papers/basecamp_papers_formatter.py | 235 ++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 src/PaperBee/papers/basecamp_papers_formatter.py diff --git a/src/PaperBee/papers/basecamp_papers_formatter.py b/src/PaperBee/papers/basecamp_papers_formatter.py new file mode 100644 index 0000000..32bdd01 --- /dev/null +++ b/src/PaperBee/papers/basecamp_papers_formatter.py @@ -0,0 +1,235 @@ +import time +import requests +import html +from typing import List, Optional, Tuple, Dict, Any +import pandas as pd +from logging import Logger + +# Example: pip install requests + + +class BasecampPaperPublisher: + """ + Publish papers (from a spreadsheet) to a Basecamp Message Board. + + Args: + account_id: Basecamp account id (the {ACCOUNT_ID} in URLs). + client_id, client_secret: OAuth credentials (from launchpad.37signals.com). + access_token: optional initial access token. + refresh_token: refresh token (used to obtain new access tokens). + user_agent: string identifying your app (required by Basecamp). + logger: logging.Logger instance. 
+ """ + + LAUNCHPAD_TOKEN_URL = "https://launchpad.37signals.com/authorization/token" + API_BASE = "https://3.basecampapi.com" + + def __init__(self, + account_id: str, + client_id: str, + client_secret: str, + user_agent: str, + logger: Logger, + bucket_id: str, + board_id: str, + access_token: Optional[str] = None, + refresh_token: Optional[str] = None): + self.account_id = account_id + self.client_id = client_id + self.client_secret = client_secret + self.user_agent = user_agent + self.logger = logger + + self.access_token = access_token + self.refresh_token = refresh_token + self._access_expires_at = 0 # epoch seconds when token expires (if known) + + # small session for connection pooling + self._session = requests.Session() + self._session.headers.update({ + "User-Agent": user_agent, + "Accept": "application/json" + }) + + # user-agent: MyApp (yourname@example.com)' \ + + # ---------------------------- + # Authentication helpers + # ---------------------------- + def _ensure_access_token(self) -> None: + """Ensure we have a valid access token; refresh if needed.""" + if not self.access_token or time.time() >= self._access_expires_at - 30: + self.logger.debug("Refreshing Basecamp access token...") + self._refresh_access_token() + + def _refresh_access_token(self) -> None: + """Refresh access token using refresh_token.""" + if not self.refresh_token: + raise RuntimeError( + "No refresh_token available to refresh access token.") + + data = { + "type": "refresh", # community examples use this type for refresh + "client_id": self.client_id, + "client_secret": self.client_secret, + "refresh_token": self.refresh_token, + } + resp = requests.post(self.LAUNCHPAD_TOKEN_URL, + data=data, + headers={"User-Agent": self.user_agent}) + if resp.status_code != 200: + self.logger.error("Failed to refresh Basecamp token: %s %s", + resp.status_code, resp.text) + resp.raise_for_status() + payload = resp.json() + self.access_token = payload.get("access_token") + # some endpoints 
return 'expires_in' seconds; store expiry timestamp if provided + expires_in = payload.get("expires_in") or payload.get("expires") + if expires_in: + self._access_expires_at = time.time() + int(expires_in) + # if the server returned a new refresh_token, update it + if payload.get("refresh_token"): + self.refresh_token = payload["refresh_token"] + + # update session auth header + self._session.headers.update({ + "Authorization": f"Bearer {self.access_token}", + "Content-Type": "application/json; charset=utf-8" + }) + + # ---------------------------- + # Discovery helpers + # ---------------------------- + # def list_projects(self) -> List[Dict[str, Any]]: + # """Return list of projects for the account.""" + # self._ensure_access_token() + # url = f"{self.API_BASE}/{self.account_id}/projects.json" + # r = self._session.get(url) + # r.raise_for_status() + # return r.json() + + # def find_project_by_name(self, + # project_name: str) -> Optional[Dict[str, Any]]: + # """Find the project dict by (case-insensitive) name. 
Returns the first match or None.""" + # projects = self.list_projects() + # for p in projects: + # if p.get("name") and p["name"].lower() == project_name.lower(): + # return p + # return None + + # def list_message_boards(self, + # project_bucket_id: str) -> List[Dict[str, Any]]: + # """List message boards under a project's bucket id.""" + # self._ensure_access_token() + # url = f"{self.API_BASE}/{self.account_id}/buckets/{project_bucket_id}/message_boards.json" + # r = self._session.get(url) + # r.raise_for_status() + # return r.json() + + # def find_message_board(self, project_bucket_id: str, + # board_name: str) -> Optional[ + # Dict[str, Any], + # ]: + # boards = self.list_message_boards(project_bucket_id) + # for b in boards: + # if b.get("name") and b["name"].lower() == board_name.lower(): + # return b + # return None + + # ---------------------------- + # Formatting / content helpers + # ---------------------------- + @staticmethod + def _escape_html(text: str) -> str: + return html.escape(text) + + def build_message( + self, + papers: List[str], + preprints: List[str], + ) -> str: + """ + Build a simple HTML body for a Basecamp Message's `content` field. + Basecamp uses HTML rich text for message content. + """ + parts = [] + parts.append( + f"

Good morning ☕ Here are today's papers!

") + parts.append("
") + if preprints: + parts.append("

Preprints


") + else: + parts.append("

No preprints today.


") + + if papers: + parts.append("

Papers

") + else: + parts.append("

No papers today.

") + + parts.append("
") + parts.append('

Posted automatically by paperbee

') + return "".join(parts) + + # ---------------------------- + # Publish + # ---------------------------- + def publish_papers(self, papers_list: List[List[str]]) -> Dict[str, Any]: + """ + Find project + board, and create a Message. + Returns the created message JSON on success. + """ + self._ensure_access_token() + + # # find project -> get its bucket id + # project = self.find_project_by_name(project_name) + # if not project: + # raise RuntimeError( + # f"Project named '{project_name}' not found in account {self.account_id}." + # ) + + # # NOTE: in many Basecamp examples the project's "id" is its bucket id; some responses include 'id' or 'bucket' keys. + # # We'll try to use project['id'] as the bucket id. + # bucket_id = str( + # project.get("id") or project.get("bucket", {}).get("id")) + # if not bucket_id: + # raise RuntimeError( + # "Could not determine project bucket id from project metadata.") + + # board = self.find_message_board(bucket_id, board_name) + # if not board: + # raise RuntimeError( + # f"Message board '{board_name}' not found under project '{project_name}'." 
+ # ) + + # board_id = str(board["id"]) + + # if not subject: + # subject = f"Papers — {today or ''}".strip() + + papers, preprints = self.format_papers(papers_list) + content_html = self.build_message(papers, preprints) + + body = { + "subject": "New subject here", + "content": content_html, + "status": "active" + } + + url = f"{self.API_BASE}/{self.account_id}/buckets/{self.bucket_id}/message_boards/{self.board_id}/messages.json" + + # self.session already has the headers, I think we don't need to pass them again + r = self._session.post(url, json=body) + if r.status_code not in (200, 201): + self.logger.error("Failed to create message: %s %s", r.status_code, + r.text) + r.raise_for_status() + self.logger.info("Posted message to Basecamp board") + return r.json() From 47d05a6d0fbe9665c198f9c85bffe3f40f5fa763 Mon Sep 17 00:00:00 2001 From: Rodrigo Gonzalez Laiz Date: Wed, 17 Sep 2025 16:04:49 +0200 Subject: [PATCH 2/5] first working version of basecamp poster --- files/config_template.yml | 19 ++- src/PaperBee/daily_posting.py | 74 +++++++-- .../papers/basecamp_papers_formatter.py | 106 ++++++++----- src/PaperBee/papers/papers_finder.py | 140 ++++++++++++++---- src/PaperBee/papers/validate_inputs.py | 7 +- 5 files changed, 254 insertions(+), 92 deletions(-) diff --git a/files/config_template.yml b/files/config_template.yml index 73db683..7a462de 100644 --- a/files/config_template.yml +++ b/files/config_template.yml @@ -49,8 +49,19 @@ MATTERMOST: team: "your-mattermost-team-name" # The team name (not display name) channel: "your-mattermost-channel-name" # The channel name (not display name) -SLACK_TEST_CHANNEL_ID: "your-slack-test-channel-id" # not required so left outside of dictionary -TELEGRAM_TEST_CHANNEL_ID: "your-slack-test-channel-id" # not required so left outside of dictionary -MATTERMOST_TEST_CHANNEL_ID: "your-mattermost-test-channel-id" # not required so left outside of dictionary -GOOGLE_TEST_SPREADSHEET_ID: "your-google-test-spreadsheet-id" # not 
required so left outside of dictionary +# Basecamp configuration +BASECAMP: + is_posting_on: true + account_id: "your-basecamp-account-id" + client_id: "your-basecamp-client-id" + client_secret: "your-basecamp-client-secret" + user_agent: "your-basecamp-user-agent" + bucket_id: "your-basecamp-bucket-id" + board_id: "your-basecamp-board-id" + + +#SLACK_TEST_CHANNEL_ID: "your-slack-test-channel-id" # not required so left outside of dictionary +#TELEGRAM_TEST_CHANNEL_ID: "your-slack-test-channel-id" # not required so left outside of dictionary +#MATTERMOST_TEST_CHANNEL_ID: "your-mattermost-test-channel-id" # not required so left outside of dictionary +#GOOGLE_TEST_SPREADSHEET_ID: "your-google-test-spreadsheet-id" # not required so left outside of dictionary diff --git a/src/PaperBee/daily_posting.py b/src/PaperBee/daily_posting.py index 31054db..6ad47e5 100644 --- a/src/PaperBee/daily_posting.py +++ b/src/PaperBee/daily_posting.py @@ -34,25 +34,54 @@ async def daily_papers_search( - Zulip response - Mattermost response """ - root_dir, query, query_biorxiv, query_pubmed_arxiv = validate_configuration(config) + root_dir, query, query_biorxiv, query_pubmed_arxiv = validate_configuration( + config) slack_args = validate_platform_args(config, "SLACK") zulip_args = validate_platform_args(config, "ZULIP") telegram_args = validate_platform_args(config, "TELEGRAM") mattermost_args = validate_platform_args(config, "MATTERMOST") + basecamp_args = validate_platform_args(config, "BASECAMP") if telegram_args == {}: - telegram_args = {"bot_token": "", "channel_id": "", "is_posting_on": False} + telegram_args = { + "bot_token": "", + "channel_id": "", + "is_posting_on": False + } if zulip_args == {}: - zulip_args = {"prc": "", "stream": "", "topic": "", "is_posting_on": False} + zulip_args = { + "prc": "", + "stream": "", + "topic": "", + "is_posting_on": False + } if slack_args == {}: slack_args = {"bot_token": "", "channel_id": "", "is_posting_on": False} if mattermost_args == {}: 
- mattermost_args = {"bot_token": "", "channel_id": "", "is_posting_on": False} + mattermost_args = { + "url": "", + "token": "", + "team": "", + "channel": "", + "is_posting_on": False + } + + if basecamp_args == {}: + basecamp_args = { + "account_id": "", + "client_id": "", + "client_secret": "", + "user_agent": "", + "bucket_id": "", + "board_id": "", + "is_posting_on": False + } llm_filtering = config.get("LLM_FILTERING", False) if llm_filtering: - filtering_prompt, LLM_PROVIDER, LANGUAGE_MODEL, OPENAI_API_KEY = validate_llm_args(config, root_dir) + filtering_prompt, LLM_PROVIDER, LANGUAGE_MODEL, OPENAI_API_KEY = validate_llm_args( + config, root_dir) else: filtering_prompt = "" LLM_PROVIDER = "" @@ -85,16 +114,32 @@ async def daily_papers_search( mattermost_token=mattermost_args["token"], mattermost_team=mattermost_args["team"], mattermost_channel=mattermost_args["channel"], + basecamp_client_id=basecamp_args["client_id"], + basecamp_client_secret=basecamp_args["client_secret"], + basecamp_account_id=basecamp_args["account_id"], + basecamp_user_agent=basecamp_args["user_agent"], + basecamp_bucket_id=basecamp_args["bucket_id"], + basecamp_board_id=basecamp_args["board_id"], + basecamp_access_token=basecamp_args["access_token"], + basecamp_refresh_token=basecamp_args["refresh_token"], databases=databases, ) - papers, response_slack, response_telegram, response_zulip, response_mattermost = await finder.run_daily( + ( + papers, + response_slack, + response_telegram, + response_zulip, + response_mattermost, + response_basecamp, + ) = await finder.run_daily( post_to_slack=slack_args["is_posting_on"], post_to_telegram=telegram_args["is_posting_on"], post_to_zulip=zulip_args["is_posting_on"], post_to_mattermost=mattermost_args["is_posting_on"], + post_to_basecamp=basecamp_args["is_posting_on"], ) - return papers, response_slack, response_telegram, response_zulip, response_mattermost + return papers, response_slack, response_telegram, response_zulip, 
response_mattermost, response_basecamp def main() -> None: @@ -102,7 +147,9 @@ def main() -> None: CLI entry point for PaperBee, supporting subcommands like 'post'. """ parser = argparse.ArgumentParser(description="PaperBee CLI") - subparsers = parser.add_subparsers(dest="command", required=True, help="Available commands") + subparsers = parser.add_subparsers(dest="command", + required=True, + help="Available commands") # Subcommand: post post_parser = subparsers.add_parser("post", help="Post daily papers") @@ -120,26 +167,27 @@ def main() -> None: post_parser.add_argument( "--since", type=int, - help="Filter out papers if published before the specified number of days ago.", + help= + "Filter out papers if published before the specified number of days ago.", ) post_parser.add_argument( "--databases", nargs="+", type=str, - help="Specify any combination of databases to search among the available ones 'pubmed','arxiv', and 'biorxiv'(e.g., ['pubmed', 'arxiv']).", + help= + "Specify any combination of databases to search among the available ones 'pubmed','arxiv', and 'biorxiv'(e.g., ['pubmed', 'arxiv']).", ) args = parser.parse_args() # Dispatch to the appropriate subcommand if args.command == "post": config = load_config(args.config) - papers, _, _, _, _ = asyncio.run( + papers, _, _, _, _, _ = asyncio.run( daily_papers_search( config, interactive=args.interactive, since=args.since, databases=args.databases, - ) - ) + )) print("Papers found:") print(papers) diff --git a/src/PaperBee/papers/basecamp_papers_formatter.py b/src/PaperBee/papers/basecamp_papers_formatter.py index 32bdd01..82f06a6 100644 --- a/src/PaperBee/papers/basecamp_papers_formatter.py +++ b/src/PaperBee/papers/basecamp_papers_formatter.py @@ -24,22 +24,25 @@ class BasecampPaperPublisher: LAUNCHPAD_TOKEN_URL = "https://launchpad.37signals.com/authorization/token" API_BASE = "https://3.basecampapi.com" - def __init__(self, - account_id: str, - client_id: str, - client_secret: str, - user_agent: str, - 
logger: Logger, - bucket_id: str, - board_id: str, - access_token: Optional[str] = None, - refresh_token: Optional[str] = None): + def __init__( + self, + logger: Logger, + account_id: str, + client_id: str, + client_secret: str, + user_agent: str, + bucket_id: str, + board_id: str, + access_token: str, + refresh_token: str, + ): self.account_id = account_id self.client_id = client_id self.client_secret = client_secret self.user_agent = user_agent self.logger = logger - + self.bucket_id = bucket_id + self.board_id = board_id self.access_token = access_token self.refresh_token = refresh_token self._access_expires_at = 0 # epoch seconds when token expires (if known) @@ -51,8 +54,6 @@ def __init__(self, "Accept": "application/json" }) - # user-agent: MyApp (yourname@example.com)' \ - # ---------------------------- # Authentication helpers # ---------------------------- @@ -83,8 +84,7 @@ def _refresh_access_token(self) -> None: resp.raise_for_status() payload = resp.json() self.access_token = payload.get("access_token") - # some endpoints return 'expires_in' seconds; store expiry timestamp if provided - expires_in = payload.get("expires_in") or payload.get("expires") + expires_in = payload.get("expires_in") if expires_in: self._access_expires_at = time.time() + int(expires_in) # if the server returned a new refresh_token, update it @@ -145,8 +145,7 @@ def _escape_html(text: str) -> str: def build_message( self, - papers: List[str], - preprints: List[str], + papers: List[List[str]], ) -> str: """ Build a simple HTML body for a Basecamp Message's `content` field. @@ -155,33 +154,58 @@ def build_message( parts = [] parts.append( f"

Good morning ☕ Here are today's papers!

") - parts.append("
") - if preprints: - parts.append("

Preprints


") - else: - parts.append("

No preprints today.


") - - if papers: - parts.append("

Papers

") - else: - parts.append("

No papers today.

") + parts.append("

Papers

") parts.append("
") parts.append('

Posted automatically by paperbee

') return "".join(parts) + #example: ['10.1101/2025.09.10.674954', '2025-09-17', '2025-09-16', 'TRUE', 'Differentiation hierarchy in adult B cell acute lymphoblastic leukemia at clonal resolution', '', None, 'https://doi.org/10.1101/2025.09.10.674954'] + # ---------------------------- # Publish # ---------------------------- - def publish_papers(self, papers_list: List[List[str]]) -> Dict[str, Any]: + + @staticmethod + def format_papers( + papers_list: List[List[str]],) -> Tuple[List[str], List[str]]: + """ + Splits and formats papers into preprints and regular papers for Mattermost. + Args: + papers_list: List of paper records. + Returns: + Tuple of (papers, preprints) as formatted strings. + """ + papers = [] + preprints = [] + for idx, paper in enumerate(papers_list): + if not isinstance(paper, list) or len(paper) < 6: + print( + f"Warning: Skipping invalid paper at index {idx}: {paper}") + continue + emoji = "✏️" if paper[3] == "TRUE" else "🗞️" + title = paper[4] + link = paper[-1] + if not isinstance(title, str) or not isinstance(link, str): + print( + f"Warning: Skipping paper with invalid title or link at index {idx}: {paper}" + ) + continue + formatted_paper = f"{emoji} [{title}]({link})" + if paper[3] == "TRUE": + preprints.append(formatted_paper) + else: + papers.append(formatted_paper) + return papers, preprints + + async def publish_papers(self, + papers_list: List[List[str]]) -> Dict[str, Any]: """ Find project + board, and create a Message. Returns the created message JSON on success. 
@@ -214,18 +238,17 @@ def publish_papers(self, papers_list: List[List[str]]) -> Dict[str, Any]: # if not subject: # subject = f"Papers — {today or ''}".strip() - papers, preprints = self.format_papers(papers_list) - content_html = self.build_message(papers, preprints) + #papers, preprints = self.format_papers(papers_list) + content_html = self.build_message(papers_list) body = { - "subject": "New subject here", + "subject": "Hello world!", "content": content_html, "status": "active" } url = f"{self.API_BASE}/{self.account_id}/buckets/{self.bucket_id}/message_boards/{self.board_id}/messages.json" - - # self.session already has the headers, I think we don't need to pass them again + #self.session already has the headers, I think we don't need to pass them again r = self._session.post(url, json=body) if r.status_code not in (200, 201): self.logger.error("Failed to create message: %s %s", r.status_code, @@ -233,3 +256,4 @@ def publish_papers(self, papers_list: List[List[str]]) -> Dict[str, Any]: r.raise_for_status() self.logger.info("Posted message to Basecamp board") return r.json() + #return body diff --git a/src/PaperBee/papers/papers_finder.py b/src/PaperBee/papers/papers_finder.py index 350a268..3b01324 100644 --- a/src/PaperBee/papers/papers_finder.py +++ b/src/PaperBee/papers/papers_finder.py @@ -17,6 +17,7 @@ from .telegram_papers_formatter import TelegramPaperPublisher from .utils import ArticlesProcessor, PubMedClient from .zulip_papers_formatter import ZulipPaperPublisher +from .basecamp_papers_formatter import BasecampPaperPublisher class PapersFinder: @@ -77,6 +78,14 @@ def __init__( mattermost_token: str = "", mattermost_team: str = "", mattermost_channel: str = "", + basecamp_account_id: str = "", + basecamp_client_id: str = "", + basecamp_client_secret: str = "", + basecamp_user_agent: str = "", + basecamp_bucket_id: str = "", + basecamp_board_id: str = "", + basecamp_access_token: str = "", + basecamp_refresh_token: str = "", ncbi_api_key: str = "", 
databases: Optional[List[str]] = None, ) -> None: @@ -84,7 +93,8 @@ def __init__( # dates self.today: date = date.today() self.today_str: str = self.today.strftime("%Y-%m-%d") - self.yesterday: date = self.today - timedelta(days=since if since is not None else 1) + self.yesterday: date = self.today - timedelta( + days=since if since is not None else 1) self.yesterday_str: str = self.yesterday.strftime("%Y-%m-%d") self.until: date = self.today self.since: date = self.yesterday @@ -102,12 +112,15 @@ def __init__( self.spreadsheet_id: str = spreadsheet_id self.sheet_name: str = sheet_name # Query and search files - self.query_biorxiv: Optional[str] = query_biorxiv if query_biorxiv else None + self.query_biorxiv: Optional[ + str] = query_biorxiv if query_biorxiv else None self.query_pub_arx: Optional[str] = query_pubmed_arxiv self.query: Optional[str] = query if query else None self.search_file: str = os.path.join(root_dir, f"{self.today_str}.json") - self.search_file_biorxiv: str = os.path.join(root_dir, f"{self.today_str}_biorxiv.json") - self.search_file_pub_arx: str = os.path.join(root_dir, f"{self.today_str}_pub_arx.json") + self.search_file_biorxiv: str = os.path.join( + root_dir, f"{self.today_str}_biorxiv.json") + self.search_file_pub_arx: str = os.path.join( + root_dir, f"{self.today_str}_pub_arx.json") # Filter self.interactive_filtering: bool = interactive self.llm_filtering: bool = llm_filtering @@ -127,6 +140,14 @@ def __init__( self.mattermost_token: str = mattermost_token self.mattermost_team: str = mattermost_team self.mattermost_channel: str = mattermost_channel + self.basecamp_account_id: str = basecamp_account_id + self.basecamp_client_id: str = basecamp_client_id + self.basecamp_client_secret: str = basecamp_client_secret + self.basecamp_user_agent: str = basecamp_user_agent + self.basecamp_bucket_id: str = basecamp_bucket_id + self.basecamp_board_id: str = basecamp_board_id + self.basecamp_access_token: str = basecamp_access_token + 
self.basecamp_refresh_token: str = basecamp_refresh_token # Logger self.logger = Logger("PapersFinder") # NCBI API @@ -154,7 +175,8 @@ def find_and_process_papers(self) -> pd.DataFrame: verbose=False, ) with open(self.search_file) as papers_file: - articles_dict: List[Dict[str, Any]] = json.load(papers_file)["papers"] + articles_dict: List[Dict[str, Any]] = json.load( + papers_file)["papers"] articles = list(articles_dict) else: if not self.query_biorxiv or not self.query_pub_arx: @@ -169,7 +191,8 @@ def find_and_process_papers(self) -> pd.DataFrame: self.limit, self.limit_per_database, [ - database for database in self.databases if database != "biorxiv" + database for database in self.databases + if database != "biorxiv" ], # Biorxiv requires a different query verbose=False, ) @@ -185,22 +208,28 @@ def find_and_process_papers(self) -> pd.DataFrame: verbose=False, ) with open(self.search_file_pub_arx) as papers_file: - articles_pub_arx_dict: List[Dict[str, Any]] = json.load(papers_file)["papers"] + articles_pub_arx_dict: List[Dict[str, Any]] = json.load( + papers_file)["papers"] with open(self.search_file_biorxiv) as papers_file: - articles_biorxiv_dict: List[Dict[str, Any]] = json.load(papers_file)["papers"] + articles_biorxiv_dict: List[Dict[str, Any]] = json.load( + papers_file)["papers"] articles = articles_pub_arx_dict + articles_biorxiv_dict doi_extractor = PubMedClient() for article in tqdm(articles): if "PubMed" in article["databases"]: - doi = doi_extractor.get_doi_from_title(article["title"], ncbi_api_key=self.ncbi_api_key) + doi = doi_extractor.get_doi_from_title( + article["title"], ncbi_api_key=self.ncbi_api_key) article["url"] = f"https://doi.org/{doi}" if doi else None else: article["url"] = next( - (s for s in article["urls"] if s.startswith("https://doi.org")), + (s for s in article["urls"] + if s.startswith("https://doi.org")), None, ) - articles = [article for article in articles if article.get("url") is not None] + articles = [ + article for 
article in articles if article.get("url") is not None + ] processor = ArticlesProcessor(articles, self.today_str) processed_articles = processor.articles self.logger.info(f"Found {len(processed_articles)} articles.") @@ -214,16 +243,22 @@ def find_and_process_papers(self) -> pd.DataFrame: OPENAI_API_KEY=self.OPENAI_API_KEY, ) processed_articles = llm_filter.filter_articles() - self.logger.info(f"Filtered down to {len(processed_articles)} articles using LLM.") + self.logger.info( + f"Filtered down to {len(processed_articles)} articles using LLM." + ) if self.interactive_filtering: cli = InteractiveCLIFilter(processed_articles) processed_articles = cli.filter_articles() - self.logger.info(f"Filtered down to {len(processed_articles)} articles manually.") + self.logger.info( + f"Filtered down to {len(processed_articles)} articles manually." + ) return processed_articles - def update_google_sheet(self, processed_articles: pd.DataFrame, row: int = 2) -> List[List[Any]]: + def update_google_sheet(self, + processed_articles: pd.DataFrame, + row: int = 2) -> List[List[Any]]: """ Updates the Google Sheet with the processed articles that are not already listed. 
@@ -237,18 +272,25 @@ def update_google_sheet(self, processed_articles: pd.DataFrame, row: int = 2) -> spreadsheet_id=self.spreadsheet_id, credentials_json_path=self.google_credentials_json, ) - gsheet_cache = gsheet_updater.read_sheet_data(sheet_name=self.sheet_name) + gsheet_cache = gsheet_updater.read_sheet_data( + sheet_name=self.sheet_name) + if gsheet_cache: published_dois = [article["DOI"] for article in gsheet_cache] - processed_articles_filtered = processed_articles[~processed_articles["DOI"].isin(published_dois)] + processed_articles_filtered = processed_articles[ + ~processed_articles["DOI"].isin(published_dois)] else: # Sheet is empty (the moment of deployment) processed_articles_filtered = processed_articles - row_data = [list(row) for row in processed_articles_filtered.values.tolist()] + row_data = [ + list(row) for row in processed_articles_filtered.values.tolist() + ] if row_data: - gsheet_updater.insert_rows(sheet_name=self.sheet_name, rows_data=row_data, row=row) + gsheet_updater.insert_rows(sheet_name=self.sheet_name, + rows_data=row_data, + row=row) return row_data def post_paper_to_slack(self, papers: List[List[str]]) -> Any: @@ -263,10 +305,10 @@ def post_paper_to_slack(self, papers: List[List[str]]) -> Any: Logger("SlackPaperPublisher"), channel_id=self.slack_channel_id, ) - papers_pub, preprints = self.slack_publisher.format_papers_for_slack(papers) + papers_pub, preprints = self.slack_publisher.format_papers_for_slack( + papers) response = self.slack_publisher.publish_papers_to_slack( - papers_pub, preprints, self.today_str, self.spreadsheet_id - ) + papers_pub, preprints, self.today_str, self.spreadsheet_id) return response async def post_paper_to_telegram(self, papers: List[List[str]]) -> Any: @@ -283,7 +325,8 @@ async def post_paper_to_telegram(self, papers: List[List[str]]) -> Any: ) papers_pub, preprints = telegram_publisher.format_papers(papers) - response = await telegram_publisher.publish_papers(papers_pub, preprints, 
self.today_str, self.spreadsheet_id) + response = await telegram_publisher.publish_papers( + papers_pub, preprints, self.today_str, self.spreadsheet_id) return response async def post_paper_to_zulip(self, papers: List[List[str]]) -> Any: @@ -302,8 +345,7 @@ async def post_paper_to_zulip(self, papers: List[List[str]]) -> Any: papers_pub, preprints = zulip_publisher.format_papers_for_zulip(papers) response = await zulip_publisher.publish_papers_to_zulip( - papers_pub, preprints, self.today_str, self.spreadsheet_id - ) + papers_pub, preprints, self.today_str, self.spreadsheet_id) return response async def post_paper_to_mattermost(self, papers: List[List[str]]) -> Any: @@ -323,28 +365,56 @@ async def post_paper_to_mattermost(self, papers: List[List[str]]) -> Any: response = await mattermost_publisher.publish_papers(papers) return response + async def post_paper_to_basecamp(self, papers: List[List[str]]) -> Any: + """ + Posts the papers to Basecamp. + + Args: + papers (List[str]): List of papers to post to Basecamp. + """ + basecamp_publisher = BasecampPaperPublisher( + Logger("BasecampPaperPublisher"), + account_id=self.basecamp_account_id, + client_id=self.basecamp_client_id, + client_secret=self.basecamp_client_secret, + user_agent=self.basecamp_user_agent, + bucket_id=self.basecamp_bucket_id, + board_id=self.basecamp_board_id, + access_token=self.basecamp_access_token, + refresh_token=self.basecamp_refresh_token, + ) + response = await basecamp_publisher.publish_papers(papers) + return response + def cleanup_files(self) -> None: """ Deletes the search result files from the previous day to keep the directory clean. 
""" - yesterday_file = os.path.join(self.root_dir, f"{self.yesterday_str}.json") + yesterday_file = os.path.join(self.root_dir, + f"{self.yesterday_str}.json") if os.path.exists(yesterday_file): os.remove(yesterday_file) print(f"Deleted yesterday's file: {yesterday_file}") else: print(f"File not found, no deletion needed for: {yesterday_file}") - yesterday_file_biorxiv = os.path.join(self.root_dir, f"{self.yesterday_str}_biorxiv.json") + yesterday_file_biorxiv = os.path.join( + self.root_dir, f"{self.yesterday_str}_biorxiv.json") if os.path.exists(yesterday_file_biorxiv): os.remove(yesterday_file_biorxiv) print(f"Deleted yesterday's file: {yesterday_file_biorxiv}") else: - print(f"File not found, no deletion needed for: {yesterday_file_biorxiv}") - yesterday_file_pub_arx = os.path.join(self.root_dir, f"{self.yesterday_str}_pub_arx.json") + print( + f"File not found, no deletion needed for: {yesterday_file_biorxiv}" + ) + yesterday_file_pub_arx = os.path.join( + self.root_dir, f"{self.yesterday_str}_pub_arx.json") if os.path.exists(yesterday_file_pub_arx): os.remove(yesterday_file_pub_arx) print(f"Deleted yesterday's file: {yesterday_file_pub_arx}") else: - print(f"File not found, no deletion needed for: {yesterday_file_pub_arx}") + print( + f"File not found, no deletion needed for: {yesterday_file_pub_arx}" + ) async def run_daily( self, @@ -352,6 +422,7 @@ async def run_daily( post_to_telegram: bool = False, post_to_zulip: bool = False, post_to_mattermost: bool = False, + post_to_basecamp: bool = False, ) -> Tuple[List[List[Any]], Any | None, Any | None, Any | None, Any | None]: """ The main method to orchestrate finding, processing, and updating papers in a Google Sheet on a daily schedule. @@ -366,12 +437,15 @@ async def run_daily( Tuple[List[List[Any]], Any]: The papers posted and the response from the posting method. 
""" processed_articles = self.find_and_process_papers() + print("processed_articles", processed_articles, flush=True) papers = self.update_google_sheet(processed_articles) + print("papers_googlesheet", papers, flush=True) response_slack = None response_telegram = None response_zulip = None response_mattermost = None + response_basecamp = None if post_to_slack: response_slack = self.post_paper_to_slack(papers) @@ -385,11 +459,15 @@ async def run_daily( if post_to_mattermost: response_mattermost = await self.post_paper_to_mattermost(papers) + if post_to_basecamp: + response_basecamp = await self.post_paper_to_basecamp(papers) + self.cleanup_files() - return papers, response_slack, response_telegram, response_zulip, response_mattermost + return papers, response_slack, response_telegram, response_zulip, response_mattermost, response_basecamp - def send_csv(self, user_id: str, user_query: str) -> Tuple[pd.DataFrame, Any]: + def send_csv(self, user_id: str, + user_query: str) -> Tuple[pd.DataFrame, Any]: """ Paired with search_articles_command listener, send the articles' list as csv file in the channel where it was requested. diff --git a/src/PaperBee/papers/validate_inputs.py b/src/PaperBee/papers/validate_inputs.py index 5b3e222..caa0eb5 100644 --- a/src/PaperBee/papers/validate_inputs.py +++ b/src/PaperBee/papers/validate_inputs.py @@ -3,8 +3,7 @@ def validate_configuration( - config: dict, -) -> Tuple[str, Optional[str], Optional[str], Optional[str]]: + config: dict,) -> Tuple[str, Optional[str], Optional[str], Optional[str]]: """ Validate the root directory, google and NCBI credentials, and the query files. 
@@ -46,7 +45,9 @@ def validate_platform_args(config: dict, platform: str) -> dict[str, Any]: raise ValueError(e) if platform_args.get("is_posting_on", False): - empty_args = [param for param in platform_args if not platform_args[param]] + empty_args = [ + param for param in platform_args if not platform_args[param] + ] if empty_args: e = f"Missing required config params for {platform}: {', '.join(empty_args)}" raise ValueError(e) From 74fa7b21e32b9e2213e5de466c96811f8975f906 Mon Sep 17 00:00:00 2001 From: Rodrigo Gonzalez Laiz Date: Thu, 18 Sep 2025 11:00:50 +0200 Subject: [PATCH 3/5] pre-commit --- src/PaperBee/daily_posting.py | 42 +++------ .../papers/basecamp_papers_formatter.py | 68 +++++--------- src/PaperBee/papers/papers_finder.py | 94 ++++++------------- src/PaperBee/papers/validate_inputs.py | 7 +- 4 files changed, 71 insertions(+), 140 deletions(-) diff --git a/src/PaperBee/daily_posting.py b/src/PaperBee/daily_posting.py index 6ad47e5..8c266a1 100644 --- a/src/PaperBee/daily_posting.py +++ b/src/PaperBee/daily_posting.py @@ -34,8 +34,7 @@ async def daily_papers_search( - Zulip response - Mattermost response """ - root_dir, query, query_biorxiv, query_pubmed_arxiv = validate_configuration( - config) + root_dir, query, query_biorxiv, query_pubmed_arxiv = validate_configuration(config) slack_args = validate_platform_args(config, "SLACK") zulip_args = validate_platform_args(config, "ZULIP") @@ -44,28 +43,13 @@ async def daily_papers_search( basecamp_args = validate_platform_args(config, "BASECAMP") if telegram_args == {}: - telegram_args = { - "bot_token": "", - "channel_id": "", - "is_posting_on": False - } + telegram_args = {"bot_token": "", "channel_id": "", "is_posting_on": False} if zulip_args == {}: - zulip_args = { - "prc": "", - "stream": "", - "topic": "", - "is_posting_on": False - } + zulip_args = {"prc": "", "stream": "", "topic": "", "is_posting_on": False} if slack_args == {}: slack_args = {"bot_token": "", "channel_id": "", "is_posting_on": 
False} if mattermost_args == {}: - mattermost_args = { - "url": "", - "token": "", - "team": "", - "channel": "", - "is_posting_on": False - } + mattermost_args = {"url": "", "token": "", "team": "", "channel": "", "is_posting_on": False} if basecamp_args == {}: basecamp_args = { @@ -75,13 +59,12 @@ async def daily_papers_search( "user_agent": "", "bucket_id": "", "board_id": "", - "is_posting_on": False + "is_posting_on": False, } llm_filtering = config.get("LLM_FILTERING", False) if llm_filtering: - filtering_prompt, LLM_PROVIDER, LANGUAGE_MODEL, OPENAI_API_KEY = validate_llm_args( - config, root_dir) + filtering_prompt, LLM_PROVIDER, LANGUAGE_MODEL, OPENAI_API_KEY = validate_llm_args(config, root_dir) else: filtering_prompt = "" LLM_PROVIDER = "" @@ -147,9 +130,7 @@ def main() -> None: CLI entry point for PaperBee, supporting subcommands like 'post'. """ parser = argparse.ArgumentParser(description="PaperBee CLI") - subparsers = parser.add_subparsers(dest="command", - required=True, - help="Available commands") + subparsers = parser.add_subparsers(dest="command", required=True, help="Available commands") # Subcommand: post post_parser = subparsers.add_parser("post", help="Post daily papers") @@ -167,15 +148,13 @@ def main() -> None: post_parser.add_argument( "--since", type=int, - help= - "Filter out papers if published before the specified number of days ago.", + help="Filter out papers if published before the specified number of days ago.", ) post_parser.add_argument( "--databases", nargs="+", type=str, - help= - "Specify any combination of databases to search among the available ones 'pubmed','arxiv', and 'biorxiv'(e.g., ['pubmed', 'arxiv']).", + help="Specify any combination of databases to search among the available ones 'pubmed','arxiv', and 'biorxiv'(e.g., ['pubmed', 'arxiv']).", ) args = parser.parse_args() @@ -188,6 +167,7 @@ def main() -> None: interactive=args.interactive, since=args.since, databases=args.databases, - )) + ) + ) print("Papers found:") 
print(papers) diff --git a/src/PaperBee/papers/basecamp_papers_formatter.py b/src/PaperBee/papers/basecamp_papers_formatter.py index 82f06a6..e45ab39 100644 --- a/src/PaperBee/papers/basecamp_papers_formatter.py +++ b/src/PaperBee/papers/basecamp_papers_formatter.py @@ -1,9 +1,9 @@ -import time -import requests import html -from typing import List, Optional, Tuple, Dict, Any -import pandas as pd +import time from logging import Logger +from typing import Any, Dict, List, Tuple + +import requests # Example: pip install requests @@ -21,7 +21,7 @@ class BasecampPaperPublisher: logger: logging.Logger instance. """ - LAUNCHPAD_TOKEN_URL = "https://launchpad.37signals.com/authorization/token" + LAUNCHPAD_AUTH_URL = "https://launchpad.37signals.com/authorization/token" API_BASE = "https://3.basecampapi.com" def __init__( @@ -49,10 +49,7 @@ def __init__( # small session for connection pooling self._session = requests.Session() - self._session.headers.update({ - "User-Agent": user_agent, - "Accept": "application/json" - }) + self._session.headers.update({"User-Agent": user_agent, "Accept": "application/json"}) # ---------------------------- # Authentication helpers @@ -66,8 +63,8 @@ def _ensure_access_token(self) -> None: def _refresh_access_token(self) -> None: """Refresh access token using refresh_token.""" if not self.refresh_token: - raise RuntimeError( - "No refresh_token available to refresh access token.") + msg = "No refresh_token available." 
+ raise RuntimeError(msg) data = { "type": "refresh", # community examples use this type for refresh @@ -75,12 +72,9 @@ def _refresh_access_token(self) -> None: "client_secret": self.client_secret, "refresh_token": self.refresh_token, } - resp = requests.post(self.LAUNCHPAD_TOKEN_URL, - data=data, - headers={"User-Agent": self.user_agent}) + resp = requests.post(self.LAUNCHPAD_AUTH_URL, data=data, headers={"User-Agent": self.user_agent}, timeout=30) if resp.status_code != 200: - self.logger.error("Failed to refresh Basecamp token: %s %s", - resp.status_code, resp.text) + self.logger.error("Failed to refresh Basecamp token: %s %s", resp.status_code, resp.text) resp.raise_for_status() payload = resp.json() self.access_token = payload.get("access_token") @@ -94,7 +88,7 @@ def _refresh_access_token(self) -> None: # update session auth header self._session.headers.update({ "Authorization": f"Bearer {self.access_token}", - "Content-Type": "application/json; charset=utf-8" + "Content-Type": "application/json; charset=utf-8", }) # ---------------------------- @@ -152,21 +146,19 @@ def build_message( Basecamp uses HTML rich text for message content. """ parts = [] - parts.append( - f"

Good morning ☕ Here are today's papers!

") + parts.append("

Good morning ☕ Here are today's papers!

") parts.append("

Papers

") parts.append("
") - parts.append('

Posted automatically by paperbee

') + parts.append("

Posted automatically by paperbee

") return "".join(parts) - #example: ['10.1101/2025.09.10.674954', '2025-09-17', '2025-09-16', 'TRUE', 'Differentiation hierarchy in adult B cell acute lymphoblastic leukemia at clonal resolution', '', None, 'https://doi.org/10.1101/2025.09.10.674954'] + # example: ['10.1101/2025.09.10.674954', '2025-09-17', '2025-09-16', 'TRUE', 'Differentiation hierarchy in adult B cell acute lymphoblastic leukemia at clonal resolution', '', None, 'https://doi.org/10.1101/2025.09.10.674954'] # ---------------------------- # Publish @@ -174,7 +166,8 @@ def build_message( @staticmethod def format_papers( - papers_list: List[List[str]],) -> Tuple[List[str], List[str]]: + papers_list: List[List[str]], + ) -> Tuple[List[str], List[str]]: """ Splits and formats papers into preprints and regular papers for Mattermost. Args: @@ -186,16 +179,13 @@ def format_papers( preprints = [] for idx, paper in enumerate(papers_list): if not isinstance(paper, list) or len(paper) < 6: - print( - f"Warning: Skipping invalid paper at index {idx}: {paper}") + print(f"Warning: Skipping invalid paper at index {idx}: {paper}") continue emoji = "✏️" if paper[3] == "TRUE" else "🗞️" title = paper[4] link = paper[-1] if not isinstance(title, str) or not isinstance(link, str): - print( - f"Warning: Skipping paper with invalid title or link at index {idx}: {paper}" - ) + print(f"Warning: Skipping paper with invalid title or link at index {idx}: {paper}") continue formatted_paper = f"{emoji} [{title}]({link})" if paper[3] == "TRUE": @@ -204,9 +194,8 @@ def format_papers( papers.append(formatted_paper) return papers, preprints - async def publish_papers(self, - papers_list: List[List[str]]) -> Dict[str, Any]: - """ + async def publish_papers(self, papers_list: List[List[str]]) -> Dict[str, Any]: + """ Find project + board, and create a Message. Returns the created message JSON on success. 
""" @@ -238,22 +227,17 @@ async def publish_papers(self, # if not subject: # subject = f"Papers — {today or ''}".strip() - #papers, preprints = self.format_papers(papers_list) + # papers, preprints = self.format_papers(papers_list) content_html = self.build_message(papers_list) - body = { - "subject": "Hello world!", - "content": content_html, - "status": "active" - } + body = {"subject": "Hello world!", "content": content_html, "status": "active"} url = f"{self.API_BASE}/{self.account_id}/buckets/{self.bucket_id}/message_boards/{self.board_id}/messages.json" - #self.session already has the headers, I think we don't need to pass them again + # self.session already has the headers, I think we don't need to pass them again r = self._session.post(url, json=body) if r.status_code not in (200, 201): - self.logger.error("Failed to create message: %s %s", r.status_code, - r.text) + self.logger.error("Failed to create message: %s %s", r.status_code, r.text) r.raise_for_status() self.logger.info("Posted message to Basecamp board") return r.json() - #return body + # return body diff --git a/src/PaperBee/papers/papers_finder.py b/src/PaperBee/papers/papers_finder.py index 3b01324..c49b7fd 100644 --- a/src/PaperBee/papers/papers_finder.py +++ b/src/PaperBee/papers/papers_finder.py @@ -9,6 +9,7 @@ from slack_sdk import WebClient from tqdm import tqdm +from .basecamp_papers_formatter import BasecampPaperPublisher from .cli import InteractiveCLIFilter from .google_sheet import GoogleSheetsUpdater from .llm_filtering import LLMFilter @@ -17,7 +18,6 @@ from .telegram_papers_formatter import TelegramPaperPublisher from .utils import ArticlesProcessor, PubMedClient from .zulip_papers_formatter import ZulipPaperPublisher -from .basecamp_papers_formatter import BasecampPaperPublisher class PapersFinder: @@ -93,8 +93,7 @@ def __init__( # dates self.today: date = date.today() self.today_str: str = self.today.strftime("%Y-%m-%d") - self.yesterday: date = self.today - timedelta( - 
days=since if since is not None else 1) + self.yesterday: date = self.today - timedelta(days=since if since is not None else 1) self.yesterday_str: str = self.yesterday.strftime("%Y-%m-%d") self.until: date = self.today self.since: date = self.yesterday @@ -112,15 +111,12 @@ def __init__( self.spreadsheet_id: str = spreadsheet_id self.sheet_name: str = sheet_name # Query and search files - self.query_biorxiv: Optional[ - str] = query_biorxiv if query_biorxiv else None + self.query_biorxiv: Optional[str] = query_biorxiv if query_biorxiv else None self.query_pub_arx: Optional[str] = query_pubmed_arxiv self.query: Optional[str] = query if query else None self.search_file: str = os.path.join(root_dir, f"{self.today_str}.json") - self.search_file_biorxiv: str = os.path.join( - root_dir, f"{self.today_str}_biorxiv.json") - self.search_file_pub_arx: str = os.path.join( - root_dir, f"{self.today_str}_pub_arx.json") + self.search_file_biorxiv: str = os.path.join(root_dir, f"{self.today_str}_biorxiv.json") + self.search_file_pub_arx: str = os.path.join(root_dir, f"{self.today_str}_pub_arx.json") # Filter self.interactive_filtering: bool = interactive self.llm_filtering: bool = llm_filtering @@ -175,8 +171,7 @@ def find_and_process_papers(self) -> pd.DataFrame: verbose=False, ) with open(self.search_file) as papers_file: - articles_dict: List[Dict[str, Any]] = json.load( - papers_file)["papers"] + articles_dict: List[Dict[str, Any]] = json.load(papers_file)["papers"] articles = list(articles_dict) else: if not self.query_biorxiv or not self.query_pub_arx: @@ -191,8 +186,7 @@ def find_and_process_papers(self) -> pd.DataFrame: self.limit, self.limit_per_database, [ - database for database in self.databases - if database != "biorxiv" + database for database in self.databases if database != "biorxiv" ], # Biorxiv requires a different query verbose=False, ) @@ -208,28 +202,22 @@ def find_and_process_papers(self) -> pd.DataFrame: verbose=False, ) with open(self.search_file_pub_arx) 
as papers_file: - articles_pub_arx_dict: List[Dict[str, Any]] = json.load( - papers_file)["papers"] + articles_pub_arx_dict: List[Dict[str, Any]] = json.load(papers_file)["papers"] with open(self.search_file_biorxiv) as papers_file: - articles_biorxiv_dict: List[Dict[str, Any]] = json.load( - papers_file)["papers"] + articles_biorxiv_dict: List[Dict[str, Any]] = json.load(papers_file)["papers"] articles = articles_pub_arx_dict + articles_biorxiv_dict doi_extractor = PubMedClient() for article in tqdm(articles): if "PubMed" in article["databases"]: - doi = doi_extractor.get_doi_from_title( - article["title"], ncbi_api_key=self.ncbi_api_key) + doi = doi_extractor.get_doi_from_title(article["title"], ncbi_api_key=self.ncbi_api_key) article["url"] = f"https://doi.org/{doi}" if doi else None else: article["url"] = next( - (s for s in article["urls"] - if s.startswith("https://doi.org")), + (s for s in article["urls"] if s.startswith("https://doi.org")), None, ) - articles = [ - article for article in articles if article.get("url") is not None - ] + articles = [article for article in articles if article.get("url") is not None] processor = ArticlesProcessor(articles, self.today_str) processed_articles = processor.articles self.logger.info(f"Found {len(processed_articles)} articles.") @@ -243,22 +231,16 @@ def find_and_process_papers(self) -> pd.DataFrame: OPENAI_API_KEY=self.OPENAI_API_KEY, ) processed_articles = llm_filter.filter_articles() - self.logger.info( - f"Filtered down to {len(processed_articles)} articles using LLM." - ) + self.logger.info(f"Filtered down to {len(processed_articles)} articles using LLM.") if self.interactive_filtering: cli = InteractiveCLIFilter(processed_articles) processed_articles = cli.filter_articles() - self.logger.info( - f"Filtered down to {len(processed_articles)} articles manually." 
- ) + self.logger.info(f"Filtered down to {len(processed_articles)} articles manually.") return processed_articles - def update_google_sheet(self, - processed_articles: pd.DataFrame, - row: int = 2) -> List[List[Any]]: + def update_google_sheet(self, processed_articles: pd.DataFrame, row: int = 2) -> List[List[Any]]: """ Updates the Google Sheet with the processed articles that are not already listed. @@ -272,25 +254,19 @@ def update_google_sheet(self, spreadsheet_id=self.spreadsheet_id, credentials_json_path=self.google_credentials_json, ) - gsheet_cache = gsheet_updater.read_sheet_data( - sheet_name=self.sheet_name) + gsheet_cache = gsheet_updater.read_sheet_data(sheet_name=self.sheet_name) if gsheet_cache: published_dois = [article["DOI"] for article in gsheet_cache] - processed_articles_filtered = processed_articles[ - ~processed_articles["DOI"].isin(published_dois)] + processed_articles_filtered = processed_articles[~processed_articles["DOI"].isin(published_dois)] else: # Sheet is empty (the moment of deployment) processed_articles_filtered = processed_articles - row_data = [ - list(row) for row in processed_articles_filtered.values.tolist() - ] + row_data = [list(row) for row in processed_articles_filtered.values.tolist()] if row_data: - gsheet_updater.insert_rows(sheet_name=self.sheet_name, - rows_data=row_data, - row=row) + gsheet_updater.insert_rows(sheet_name=self.sheet_name, rows_data=row_data, row=row) return row_data def post_paper_to_slack(self, papers: List[List[str]]) -> Any: @@ -305,10 +281,10 @@ def post_paper_to_slack(self, papers: List[List[str]]) -> Any: Logger("SlackPaperPublisher"), channel_id=self.slack_channel_id, ) - papers_pub, preprints = self.slack_publisher.format_papers_for_slack( - papers) + papers_pub, preprints = self.slack_publisher.format_papers_for_slack(papers) response = self.slack_publisher.publish_papers_to_slack( - papers_pub, preprints, self.today_str, self.spreadsheet_id) + papers_pub, preprints, self.today_str, 
self.spreadsheet_id + ) return response async def post_paper_to_telegram(self, papers: List[List[str]]) -> Any: @@ -325,8 +301,7 @@ async def post_paper_to_telegram(self, papers: List[List[str]]) -> Any: ) papers_pub, preprints = telegram_publisher.format_papers(papers) - response = await telegram_publisher.publish_papers( - papers_pub, preprints, self.today_str, self.spreadsheet_id) + response = await telegram_publisher.publish_papers(papers_pub, preprints, self.today_str, self.spreadsheet_id) return response async def post_paper_to_zulip(self, papers: List[List[str]]) -> Any: @@ -345,7 +320,8 @@ async def post_paper_to_zulip(self, papers: List[List[str]]) -> Any: papers_pub, preprints = zulip_publisher.format_papers_for_zulip(papers) response = await zulip_publisher.publish_papers_to_zulip( - papers_pub, preprints, self.today_str, self.spreadsheet_id) + papers_pub, preprints, self.today_str, self.spreadsheet_id + ) return response async def post_paper_to_mattermost(self, papers: List[List[str]]) -> Any: @@ -390,31 +366,24 @@ def cleanup_files(self) -> None: """ Deletes the search result files from the previous day to keep the directory clean. 
""" - yesterday_file = os.path.join(self.root_dir, - f"{self.yesterday_str}.json") + yesterday_file = os.path.join(self.root_dir, f"{self.yesterday_str}.json") if os.path.exists(yesterday_file): os.remove(yesterday_file) print(f"Deleted yesterday's file: {yesterday_file}") else: print(f"File not found, no deletion needed for: {yesterday_file}") - yesterday_file_biorxiv = os.path.join( - self.root_dir, f"{self.yesterday_str}_biorxiv.json") + yesterday_file_biorxiv = os.path.join(self.root_dir, f"{self.yesterday_str}_biorxiv.json") if os.path.exists(yesterday_file_biorxiv): os.remove(yesterday_file_biorxiv) print(f"Deleted yesterday's file: {yesterday_file_biorxiv}") else: - print( - f"File not found, no deletion needed for: {yesterday_file_biorxiv}" - ) - yesterday_file_pub_arx = os.path.join( - self.root_dir, f"{self.yesterday_str}_pub_arx.json") + print(f"File not found, no deletion needed for: {yesterday_file_biorxiv}") + yesterday_file_pub_arx = os.path.join(self.root_dir, f"{self.yesterday_str}_pub_arx.json") if os.path.exists(yesterday_file_pub_arx): os.remove(yesterday_file_pub_arx) print(f"Deleted yesterday's file: {yesterday_file_pub_arx}") else: - print( - f"File not found, no deletion needed for: {yesterday_file_pub_arx}" - ) + print(f"File not found, no deletion needed for: {yesterday_file_pub_arx}") async def run_daily( self, @@ -466,8 +435,7 @@ async def run_daily( return papers, response_slack, response_telegram, response_zulip, response_mattermost, response_basecamp - def send_csv(self, user_id: str, - user_query: str) -> Tuple[pd.DataFrame, Any]: + def send_csv(self, user_id: str, user_query: str) -> Tuple[pd.DataFrame, Any]: """ Paired with search_articles_command listener, send the articles' list as csv file in the channel where it was requested. 
diff --git a/src/PaperBee/papers/validate_inputs.py b/src/PaperBee/papers/validate_inputs.py index caa0eb5..5b3e222 100644 --- a/src/PaperBee/papers/validate_inputs.py +++ b/src/PaperBee/papers/validate_inputs.py @@ -3,7 +3,8 @@ def validate_configuration( - config: dict,) -> Tuple[str, Optional[str], Optional[str], Optional[str]]: + config: dict, +) -> Tuple[str, Optional[str], Optional[str], Optional[str]]: """ Validate the root directory, google and NCBI credentials, and the query files. @@ -45,9 +46,7 @@ def validate_platform_args(config: dict, platform: str) -> dict[str, Any]: raise ValueError(e) if platform_args.get("is_posting_on", False): - empty_args = [ - param for param in platform_args if not platform_args[param] - ] + empty_args = [param for param in platform_args if not platform_args[param]] if empty_args: e = f"Missing required config params for {platform}: {', '.join(empty_args)}" raise ValueError(e) From 69cd056b47a8e59a12e9695b7838aa7fedb2ad1c Mon Sep 17 00:00:00 2001 From: Rodrigo Gonzalez Laiz Date: Thu, 18 Sep 2025 11:10:53 +0200 Subject: [PATCH 4/5] update config template --- files/config_template.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/files/config_template.yml b/files/config_template.yml index 7a462de..a42ff49 100644 --- a/files/config_template.yml +++ b/files/config_template.yml @@ -52,16 +52,17 @@ MATTERMOST: # Basecamp configuration BASECAMP: is_posting_on: true - account_id: "your-basecamp-account-id" + access_token: "your-basecamp-access-token" + refresh_token: "your-basecamp-refresh-token" client_id: "your-basecamp-client-id" client_secret: "your-basecamp-client-secret" + account_id: "your-basecamp-account-id" user_agent: "your-basecamp-user-agent" bucket_id: "your-basecamp-bucket-id" board_id: "your-basecamp-board-id" - -#SLACK_TEST_CHANNEL_ID: "your-slack-test-channel-id" # not required so left outside of dictionary -#TELEGRAM_TEST_CHANNEL_ID: "your-slack-test-channel-id" # not required so 
left outside of dictionary -#MATTERMOST_TEST_CHANNEL_ID: "your-mattermost-test-channel-id" # not required so left outside of dictionary -#GOOGLE_TEST_SPREADSHEET_ID: "your-google-test-spreadsheet-id" # not required so left outside of dictionary +SLACK_TEST_CHANNEL_ID: "your-slack-test-channel-id" # not required so left outside of dictionary +TELEGRAM_TEST_CHANNEL_ID: "your-telegram-test-channel-id" # not required so left outside of dictionary +MATTERMOST_TEST_CHANNEL_ID: "your-mattermost-test-channel-id" # not required so left outside of dictionary +GOOGLE_TEST_SPREADSHEET_ID: "your-google-test-spreadsheet-id" # not required so left outside of dictionary \ No newline at end of file From dfc50ca1be987156ea4da919e60b2031e26b7c0b Mon Sep 17 00:00:00 2001 From: Rodrigo Gonzalez Laiz Date: Thu, 18 Sep 2025 12:00:18 +0200 Subject: [PATCH 5/5] clean up --- .../papers/basecamp_papers_formatter.py | 134 ++---------------- src/PaperBee/papers/papers_finder.py | 2 - 2 files changed, 14 insertions(+), 122 deletions(-) diff --git a/src/PaperBee/papers/basecamp_papers_formatter.py b/src/PaperBee/papers/basecamp_papers_formatter.py index e45ab39..c897f3a 100644 --- a/src/PaperBee/papers/basecamp_papers_formatter.py +++ b/src/PaperBee/papers/basecamp_papers_formatter.py @@ -1,24 +1,26 @@ import html import time +from datetime import datetime from logging import Logger -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List import requests -# Example: pip install requests - class BasecampPaperPublisher: """ Publish papers (from a spreadsheet) to a Basecamp Message Board. Args: + logger: logging.Logger instance. account_id: Basecamp account id (the {ACCOUNT_ID} in URLs). client_id, client_secret: OAuth credentials (from launchpad.37signals.com). + client_secret: OAuth credentials (from launchpad.37signals.com). + user_agent: string identifying your app (required by Basecamp). + bucket_id: Basecamp bucket id (the {BUCKET_ID} in URLs). 
+ board_id: Basecamp board id (the {BOARD_ID} in URLs). access_token: optional initial access token. refresh_token: refresh token (used to obtain new access tokens). - user_agent: string identifying your app (required by Basecamp). - logger: logging.Logger instance. """ LAUNCHPAD_AUTH_URL = "https://launchpad.37signals.com/authorization/token" @@ -51,9 +53,6 @@ def __init__( self._session = requests.Session() self._session.headers.update({"User-Agent": user_agent, "Accept": "application/json"}) - # ---------------------------- - # Authentication helpers - # ---------------------------- def _ensure_access_token(self) -> None: """Ensure we have a valid access token; refresh if needed.""" if not self.access_token or time.time() >= self._access_expires_at - 30: @@ -66,6 +65,7 @@ def _refresh_access_token(self) -> None: msg = "No refresh_token available." raise RuntimeError(msg) + # NOTE(Rodrigo): {"type": "refresh"} as in https://github.com/basecamp/api/blob/master/sections/authentication.md data = { "type": "refresh", # community examples use this type for refresh "client_id": self.client_id, @@ -91,48 +91,6 @@ def _refresh_access_token(self) -> None: "Content-Type": "application/json; charset=utf-8", }) - # ---------------------------- - # Discovery helpers - # ---------------------------- - # def list_projects(self) -> List[Dict[str, Any]]: - # """Return list of projects for the account.""" - # self._ensure_access_token() - # url = f"{self.API_BASE}/{self.account_id}/projects.json" - # r = self._session.get(url) - # r.raise_for_status() - # return r.json() - - # def find_project_by_name(self, - # project_name: str) -> Optional[Dict[str, Any]]: - # """Find the project dict by (case-insensitive) name. 
Returns the first match or None.""" - # projects = self.list_projects() - # for p in projects: - # if p.get("name") and p["name"].lower() == project_name.lower(): - # return p - # return None - - # def list_message_boards(self, - # project_bucket_id: str) -> List[Dict[str, Any]]: - # """List message boards under a project's bucket id.""" - # self._ensure_access_token() - # url = f"{self.API_BASE}/{self.account_id}/buckets/{project_bucket_id}/message_boards.json" - # r = self._session.get(url) - # r.raise_for_status() - # return r.json() - - # def find_message_board(self, project_bucket_id: str, - # board_name: str) -> Optional[ - # Dict[str, Any], - # ]: - # boards = self.list_message_boards(project_bucket_id) - # for b in boards: - # if b.get("name") and b["name"].lower() == board_name.lower(): - # return b - # return None - - # ---------------------------- - # Formatting / content helpers - # ---------------------------- @staticmethod def _escape_html(text: str) -> str: return html.escape(text) @@ -147,90 +105,27 @@ def build_message( """ parts = [] parts.append("

Good morning ☕ Here are today's papers!

") - parts.append("

Papers

    ") - + # parts.append("

    Papers

      ") + parts.append("") - parts.append("
      ") - parts.append("

      Posted automatically by paperbee

      ") + # parts.append("
      ") + # parts.append("

      Posted automatically by paperbee

      ") return "".join(parts) # example: ['10.1101/2025.09.10.674954', '2025-09-17', '2025-09-16', 'TRUE', 'Differentiation hierarchy in adult B cell acute lymphoblastic leukemia at clonal resolution', '', None, 'https://doi.org/10.1101/2025.09.10.674954'] - # ---------------------------- - # Publish - # ---------------------------- - - @staticmethod - def format_papers( - papers_list: List[List[str]], - ) -> Tuple[List[str], List[str]]: - """ - Splits and formats papers into preprints and regular papers for Mattermost. - Args: - papers_list: List of paper records. - Returns: - Tuple of (papers, preprints) as formatted strings. - """ - papers = [] - preprints = [] - for idx, paper in enumerate(papers_list): - if not isinstance(paper, list) or len(paper) < 6: - print(f"Warning: Skipping invalid paper at index {idx}: {paper}") - continue - emoji = "✏️" if paper[3] == "TRUE" else "🗞️" - title = paper[4] - link = paper[-1] - if not isinstance(title, str) or not isinstance(link, str): - print(f"Warning: Skipping paper with invalid title or link at index {idx}: {paper}") - continue - formatted_paper = f"{emoji} [{title}]({link})" - if paper[3] == "TRUE": - preprints.append(formatted_paper) - else: - papers.append(formatted_paper) - return papers, preprints - async def publish_papers(self, papers_list: List[List[str]]) -> Dict[str, Any]: - """ - Find project + board, and create a Message. - Returns the created message JSON on success. - """ self._ensure_access_token() - # # find project -> get its bucket id - # project = self.find_project_by_name(project_name) - # if not project: - # raise RuntimeError( - # f"Project named '{project_name}' not found in account {self.account_id}." - # ) - - # # NOTE: in many Basecamp examples the project's "id" is its bucket id; some responses include 'id' or 'bucket' keys. - # # We'll try to use project['id'] as the bucket id. 
- # bucket_id = str( - # project.get("id") or project.get("bucket", {}).get("id")) - # if not bucket_id: - # raise RuntimeError( - # "Could not determine project bucket id from project metadata.") - - # board = self.find_message_board(bucket_id, board_name) - # if not board: - # raise RuntimeError( - # f"Message board '{board_name}' not found under project '{project_name}'." - # ) - - # board_id = str(board["id"]) - - # if not subject: - # subject = f"Papers — {today or ''}".strip() - # papers, preprints = self.format_papers(papers_list) content_html = self.build_message(papers_list) - body = {"subject": "Hello world!", "content": content_html, "status": "active"} + today_str = datetime.now().strftime("%d-%m-%Y") + body = {"subject": f"Papers from {today_str}", "content": content_html, "status": "active"} url = f"{self.API_BASE}/{self.account_id}/buckets/{self.bucket_id}/message_boards/{self.board_id}/messages.json" # self.session already has the headers, I think we don't need to pass them again @@ -240,4 +135,3 @@ async def publish_papers(self, papers_list: List[List[str]]) -> Dict[str, Any]: r.raise_for_status() self.logger.info("Posted message to Basecamp board") return r.json() - # return body diff --git a/src/PaperBee/papers/papers_finder.py b/src/PaperBee/papers/papers_finder.py index c49b7fd..fa7c1d9 100644 --- a/src/PaperBee/papers/papers_finder.py +++ b/src/PaperBee/papers/papers_finder.py @@ -406,9 +406,7 @@ async def run_daily( Tuple[List[List[Any]], Any]: The papers posted and the response from the posting method. """ processed_articles = self.find_and_process_papers() - print("processed_articles", processed_articles, flush=True) papers = self.update_google_sheet(processed_articles) - print("papers_googlesheet", papers, flush=True) response_slack = None response_telegram = None