From 674f963f388689fcd67bbbcbad26468e09cc86b5 Mon Sep 17 00:00:00 2001
From: abram axel booth
Date: Wed, 25 Mar 2026 10:29:17 -0400
Subject: [PATCH 1/2] wip: es8 djelme records (migration targets)

---
 osf/metrics/es8_metrics.py | 221 +++++++++++++++++++++++++++++++++++++
 1 file changed, 221 insertions(+)
 create mode 100644 osf/metrics/es8_metrics.py

diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py
new file mode 100644
index 00000000000..ec20215449e
--- /dev/null
+++ b/osf/metrics/es8_metrics.py
@@ -0,0 +1,221 @@
+from __future__ import annotations
+import datetime
+
+import elasticsearch8.dsl as esdsl
+import elasticsearch_metrics.imps.elastic8 as djelme
+
+from osf.metrics.utils import YearMonth
+
+
+###
+# custom dsl fields
+
+class YearmonthField(esdsl.Date):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs, format='strict_year_month')
+
+    def deserialize(self, data):
+        if isinstance(data, int):
+            # elasticsearch stores dates in milliseconds since the unix epoch (utc, so convert in utc -- not server-local time)
+            _as_datetime = datetime.datetime.fromtimestamp(data // 1000, tz=datetime.timezone.utc)
+            return YearMonth.from_date(_as_datetime)
+        elif data is None:
+            return None
+        try:
+            return YearMonth.from_any(data)
+        except ValueError:
+            raise ValueError(f'unsure how to deserialize "{data}" (of type {type(data)}) to YearMonth')
+
+    def serialize(self, data):
+        if isinstance(data, str):
+            return data
+        elif isinstance(data, YearMonth):
+            return str(data)
+        elif isinstance(data, (datetime.datetime, datetime.date)):
+            return str(YearMonth.from_date(data))
+        elif data is None:
+            return None
+        else:
+            raise ValueError(f'unsure how to serialize "{data}" (of type {type(data)}) as YYYY-MM')
+
+
+###
+# inner objects for events
+
+route_prefix_analyzer = esdsl.analyzer(
+    'route_prefix_analyzer',
+    tokenizer=esdsl.tokenizer('route_prefix_tokenizer', 'path_hierarchy', delimiter='.'),
+)
+
+
+class PageviewInfo(esdsl.InnerDoc):
+    """PageviewInfo
+
+    for CountedAuthUsage generated by viewing a web page
+    """
+    # fields that should be provided
+    referer_url: str
+    page_url: str
+    page_title: str
+    route_name: str = esdsl.mapped_field(esdsl.Keyword(
+        fields={
+            'by_prefix': esdsl.Text(analyzer=route_prefix_analyzer),
+        },
+    ))
+
+    # fields autofilled from the above (see `_autofill_fields`)
+    page_path: str
+    referer_domain: str
+    hour_of_day: str
+
+
+###
+# Event records
+
+class OsfCountedUsageRecord(djelme.CountedUsageRecord):
+    '''
+
+    inherited fields:
+    platform_iri: str
+    database_iri: str
+    item_iri: str
+    sessionhour_id: str
+    within_iris: list[str]
+    '''
+    # osf-specific fields
+    item_osfid: str
+    item_type: str
+    item_public: bool
+    user_is_authenticated: bool
+    action_labels: list[str]
+    pageview_info: PageviewInfo
+
+
+###
+# Reusable inner objects for reports
+
+class RunningTotal(esdsl.InnerDoc):
+    total: int
+    total_daily: int
+
+
+class FileRunningTotals(esdsl.InnerDoc):
+    total: int
+    public: int
+    private: int
+    total_daily: int
+    public_daily: int
+    private_daily: int
+
+
+class NodeRunningTotals(esdsl.InnerDoc):
+    total: int
+    total_excluding_spam: int
+    public: int
+    private: int
+    total_daily: int
+    total_daily_excluding_spam: int
+    public_daily: int
+    private_daily: int
+
+
+class RegistrationRunningTotals(esdsl.InnerDoc):
+    total: int
+    public: int
+    embargoed: int
+    embargoed_v2: int
+    withdrawn: int
+    total_daily: int
+    public_daily: int
+    embargoed_daily: int
+    embargoed_v2_daily: int
+    withdrawn_daily: int
+
+
+###
+# Cyclic reports
+
+
+class SpamSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY):
+    node_confirmed_spam: int
+    node_confirmed_ham: int
+    node_flagged: int
+    registration_confirmed_spam: int
+    registration_confirmed_ham: int
+    registration_flagged: int
+    preprint_confirmed_spam: int
+    preprint_confirmed_ham: int
+    preprint_flagged: int
+    user_marked_as_spam: int
+    user_marked_as_ham: int
+
+
+class InstitutionalUserReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY):
+    # TODO: UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'institution_id', 'user_id',)
+    institution_id: str
+    # user info:
+    user_id: str
+    user_name: str
+    department_name: str
+    month_last_login = YearmonthField()
+    month_last_active = YearmonthField()
+    account_creation_date = YearmonthField()
+    orcid_id: str
+    # counts:
+    public_project_count: int
+    private_project_count: int
+    public_registration_count: int
+    embargoed_registration_count: int
+    published_preprint_count: int
+    public_file_count: int = esdsl.mapped_field(esdsl.Long())
+    storage_byte_count: int = esdsl.mapped_field(esdsl.Long())
+
+
+class InstitutionMonthlySummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY):
+    UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'institution_id', )
+    institution_id: str
+    user_count: int
+    public_project_count: int
+    private_project_count: int
+    public_registration_count: int
+    embargoed_registration_count: int
+    published_preprint_count: int
+    storage_byte_count: int = esdsl.mapped_field(esdsl.Long())
+    public_file_count: int = esdsl.mapped_field(esdsl.Long())
+    monthly_logged_in_user_count: int = esdsl.mapped_field(esdsl.Long())
+    monthly_active_user_count: int = esdsl.mapped_field(esdsl.Long())
+
+
+class PublicItemUsageReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY):
+    # TODO: UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'item_osfid')
+
+    # where noted, fields are meant to correspond to defined terms from COUNTER
+    # https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html
+    # https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html
+    item_osfid: str
+    item_type: list[str]  # counter:Data-Type
+    provider_id: list[str]  # counter:Database(?)
+    platform_iri: list[str]  # counter:Platform
+
+    # view counts include views on components or files contained by this item
+    view_count: int = esdsl.mapped_field(esdsl.Long())
+    view_session_count: int = esdsl.mapped_field(esdsl.Long())
+    cumulative_view_count: int = esdsl.mapped_field(esdsl.Long())
+    cumulative_view_session_count: int = esdsl.mapped_field(esdsl.Long())
+
+    # download counts of this item only (not including contained components or files)
+    download_count: int = esdsl.mapped_field(esdsl.Long())
+    download_session_count: int = esdsl.mapped_field(esdsl.Long())
+    cumulative_download_count: int = esdsl.mapped_field(esdsl.Long())
+    cumulative_download_session_count: int = esdsl.mapped_field(esdsl.Long())
+
+
+class PrivateSpamMetricsReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY):
+    node_oopspam_flagged: int
+    node_oopspam_hammed: int
+    node_akismet_flagged: int
+    node_akismet_hammed: int
+    preprint_oopspam_flagged: int
+    preprint_oopspam_hammed: int
+    preprint_akismet_flagged: int
+    preprint_akismet_hammed: int

From 2e73161b508a73e192ae3675f60ec05569502848 Mon Sep 17 00:00:00 2001
From: Bohdan Odintsov
Date: Wed, 1 Apr 2026 00:52:14 +0300
Subject: [PATCH 2/2] add new metrics

---
 osf/metrics/es8_metrics.py | 166 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 162 insertions(+), 4 deletions(-)

diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py
index ec20215449e..4c1b2de4a2d 100644
--- a/osf/metrics/es8_metrics.py
+++ b/osf/metrics/es8_metrics.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 import datetime
-
+import enum
 import elasticsearch8.dsl as esdsl
 import elasticsearch_metrics.imps.elastic8 as djelme
 
@@ -91,6 +91,100 @@ class OsfCountedUsageRecord(djelme.CountedUsageRecord):
     pageview_info: PageviewInfo
 
 
+class CountedAuthUsage(djelme.CountedUsageRecord):
+    """CountedAuthUsage
+
+    Something was used! Let's quickly take note of that and
+    move on, then come back later to query/analyze/investigate.
+
+    Aim to support a COUNTER-style reporting api
+    (see https://cop5.projectcounter.org/en/5.0.2/)
+    """
+
+    # where noted, fields correspond to defined terms from COUNTER
+    # https://cop5.projectcounter.org/en/5.0.2/appendices/a-glossary-of-terms.html
+    platform_iri: str
+    provider_id: str
+    session_id: str
+    item_guid: str
+    item_type: str
+    surrounding_guids: list[str]
+    item_public: bool
+    user_is_authenticated: bool
+    action_labels: list[str]
+    class ActionLabel(enum.Enum):
+        SEARCH = 'search'  # counter:Search
+        VIEW = 'view'  # counter:Investigation
+        DOWNLOAD = 'download'  # counter:Request
+        WEB = 'web'  # counter:Regular (aka "pageview")
+        API = 'api'  # counter:TDM (aka "non-web api usage")
+        # TODO: count api usage, distinguish between web and non-web api requests
+
+    # pageviews get additional info to support the "node analytics" view
+    # (see `api.metrics.views.NodeAnalyticsQuery`)
+    pageview_info: PageviewInfo
+
+    class Meta:
+        dynamic = djelme.MetaField('strict')
+        source = djelme.MetaField(enabled=True)
+
+
+class BasePreprintMetrics(djelme.CountedUsageRecord):
+    '''
+    inherited fields:
+    platform_iri: str
+    database_iri: str
+    item_iri: str
+    sessionhour_id: str
+    within_iris: list[str]
+    '''
+    count: int
+    provider_id: str
+    user_id: str
+    preprint_id: str
+    version: str
+    path: str
+
+    class Index:
+        settings = {
+            'number_of_shards': 1,
+            'number_of_replicas': 1,
+            'refresh_interval': '1s',
+        }
+
+    class Meta:
+        abstract = True
+        source = djelme.MetaField(enabled=True)
+
+
+class PreprintView(BasePreprintMetrics):
+    pass
+
+
+class PreprintDownload(BasePreprintMetrics):
+    pass
+
+
+class RegistriesModerationMetrics(djelme.CountedUsageRecord):
+    registration_id: str
+    provider_id: str
+    trigger: str
+    from_state: str
+    to_state: str
+    user_id: str
+    comment: str
+
+    class Index:
+        settings = {
+            'number_of_shards': 1,
+            'number_of_replicas': 1,
+            'refresh_interval': '1s',
+        }
+
+    class Meta:
+        source = djelme.MetaField(enabled=True)
+
+
 ###
 # Reusable inner objects for reports
 
@@ -132,10 +226,74 @@ class RegistrationRunningTotals(esdsl.InnerDoc):
     withdrawn_daily: int
 
 
+class UsageByStorageAddon(esdsl.InnerDoc):
+    addon_shortname: str
+    enabled_usersettings: RunningTotal
+    linked_usersettings: RunningTotal
+    deleted_usersettings: RunningTotal
+    usersetting_links: RunningTotal
+    connected_nodesettings: RunningTotal
+    disconnected_nodesettings: RunningTotal
+    deleted_nodesettings: RunningTotal
+
+
 ###
 # Cyclic reports
 
 
+class StorageAddonUsage(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY):
+    usage_by_addon: UsageByStorageAddon
+
+
+class DownloadCountReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY):
+    daily_file_downloads: int
+
+
+class InstitutionSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY):
+    UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id',)
+
+    institution_id: str
+    institution_name: str
+    users: RunningTotal
+    nodes: NodeRunningTotals
+    projects: NodeRunningTotals
+    registered_nodes: RegistrationRunningTotals
+    registered_projects: RegistrationRunningTotals
+
+
+class NewUserDomainReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY):
+    UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'domain_name',)
+
+    domain_name: str
+    new_user_count: int
+
+
+class NodeSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY):
+    nodes: NodeRunningTotals
+    projects: NodeRunningTotals
+    registered_nodes: RegistrationRunningTotals
+    registered_projects: RegistrationRunningTotals
+
+
+class OsfstorageFileCountReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY):
+    files: FileRunningTotals
+
+
+class PreprintSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY):
+    UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'provider_key',)
+    provider_key: str
+    preprint_count: int
+
+
+class UserSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY):
+    active: int
+    deactivated: int
+    merged: int
+    new_users_daily: int
+    new_users_with_institution_daily: int
+    unconfirmed: int
+
+
 class SpamSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY):
     node_confirmed_spam: int
     node_confirmed_ham: int
@@ -151,7 +309,7 @@ class SpamSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY):
 
 
 class InstitutionalUserReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY):
-    # TODO: UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'institution_id', 'user_id',)
+    UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', 'user_id',)
     institution_id: str
     # user info:
     user_id: str
@@ -172,7 +330,7 @@ class InstitutionalUserReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHL
 
 
 class InstitutionMonthlySummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY):
-    UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'institution_id', )
+    UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', )
     institution_id: str
     user_count: int
     public_project_count: int
@@ -187,7 +345,7 @@ class InstitutionMonthlySummaryReport(djelme.CyclicRecord, cycle_timedepth=djelm
 
 
 class PublicItemUsageReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY):
-    # TODO: UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'item_osfid')
+    UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'item_osfid')
 
     # where noted, fields are meant to correspond to defined terms from COUNTER
     # https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html