Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
379 changes: 379 additions & 0 deletions osf/metrics/es8_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,379 @@
from __future__ import annotations
import datetime
import enum
import elasticsearch8.dsl as esdsl
import elasticsearch_metrics.imps.elastic8 as djelme

from osf.metrics.utils import YearMonth


###
# custom dsl fields

class YearmonthField(esdsl.Date):
    """Date field holding `YearMonth` values, serialized as `YYYY-MM` strings.

    Always constructed with `format='strict_year_month'`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs, format='strict_year_month')

    def deserialize(self, data):
        """Convert a stored value into a `YearMonth`.

        Accepts None (returned as-is), an int (treated as milliseconds since
        the unix epoch), or anything `YearMonth.from_any` understands.

        Raises:
            ValueError: when `YearMonth.from_any` rejects the value.
        """
        if data is None:
            return None
        if isinstance(data, int):
            # elasticsearch stores dates in milliseconds since the unix epoch;
            # convert in UTC so the resulting month does not depend on the
            # server's local timezone (a month-start instant would otherwise
            # fall in the previous month on hosts west of UTC)
            _as_datetime = datetime.datetime.fromtimestamp(
                data // 1000,
                tz=datetime.timezone.utc,
            )
            return YearMonth.from_date(_as_datetime)
        try:
            return YearMonth.from_any(data)
        except ValueError as _error:
            raise ValueError(
                f'unsure how to deserialize "{data}" (of type {type(data)}) to YearMonth'
            ) from _error

    def serialize(self, data):
        """Convert a value into the `YYYY-MM` string to store (or None).

        Strings are passed through unchanged (not validated here); `YearMonth`,
        `datetime.datetime` and `datetime.date` values are rendered via
        `str(YearMonth)`.

        Raises:
            ValueError: for any other type.
        """
        if data is None:
            return None
        if isinstance(data, str):
            return data
        if isinstance(data, YearMonth):
            return str(data)
        if isinstance(data, (datetime.datetime, datetime.date)):
            return str(YearMonth.from_date(data))
        raise ValueError(f'unsure how to serialize "{data}" (of type {type(data)}) as YYYY-MM')


###
# inner objects for events

# analyzer for dotted route names, built on a `path_hierarchy` tokenizer with
# '.' as its delimiter -- per the elasticsearch docs, that tokenizer emits one
# term per ancestor prefix (e.g. `a.b.c` -> `a`, `a.b`, `a.b.c`)
route_prefix_analyzer = esdsl.analyzer(
    'route_prefix_analyzer',
    tokenizer=esdsl.tokenizer('route_prefix_tokenizer', 'path_hierarchy', delimiter='.'),
)


class PageviewInfo(esdsl.InnerDoc):
    """Extra details for a CountedAuthUsage generated by viewing a web page."""

    # --- fields the recorder is expected to provide ---
    referer_url: str
    page_url: str
    page_title: str
    # keyword field with an additional `by_prefix` text sub-field that is
    # analyzed with `route_prefix_analyzer`
    route_name: str = esdsl.mapped_field(
        esdsl.Keyword(
            fields={'by_prefix': esdsl.Text(analyzer=route_prefix_analyzer)},
        ),
    )

    # --- fields autofilled from the ones above (see `_autofill_fields`) ---
    page_path: str
    referer_domain: str
    # NOTE(review): annotated `str`; if this holds an hour number, `int` may
    # be what is intended -- confirm against the code that fills it in
    hour_of_day: str


###
# Event records

class OsfCountedUsageRecord(djelme.CountedUsageRecord):
    """`djelme.CountedUsageRecord` extended with osf-specific fields.

    fields inherited from the base record:
        platform_iri: str
        database_iri: str
        item_iri: str
        sessionhour_id: str
        within_iris: list[str]
    """

    # fields specific to osf
    item_osfid: str
    item_type: str
    item_public: bool
    user_is_authenticated: bool
    action_labels: list[str]
    pageview_info: PageviewInfo


class CountedAuthUsage(djelme.CountedUsageRecord):
    """Quick note that something was used, kept for later querying/analysis.

    Aims to support a COUNTER-style reporting api
    (see https://cop5.projectcounter.org/en/5.0.2/)
    """

    # where noted, fields correspond to defined terms from COUNTER
    # https://cop5.projectcounter.org/en/5.0.2/appendices/a-glossary-of-terms.html
    platform_iri: str
    provider_id: str
    session_id: str
    item_guid: str
    item_type: str
    surrounding_guids: list[str]
    item_public: bool
    user_is_authenticated: bool
    action_labels: list[str]

    class ActionLabel(enum.Enum):
        """Known labels; each is annotated with its COUNTER counterpart."""
        SEARCH = 'search'  # counter:Search
        VIEW = 'view'  # counter:Investigation
        DOWNLOAD = 'download'  # counter:Request
        WEB = 'web'  # counter:Regular (aka "pageview")
        API = 'api'  # counter:TDM (aka "non-web api usage")
        # TODO: count api usage, distinguish between web and non-web api requests

    # pageviews carry additional info to support the "node analytics" view
    # (see `api.metrics.views.NodeAnalyticsQuery`)
    pageview_info: PageviewInfo

    class Meta:
        # `dynamic: strict` -- in elasticsearch this rejects unmapped fields
        dynamic = djelme.MetaField('strict')
        source = djelme.MetaField(enabled=True)


class BasePreprintMetrics(djelme.CountedUsageRecord):
    """Abstract base (`Meta.abstract = True`) shared by the preprint usage records.

    fields inherited from `djelme.CountedUsageRecord`:
        platform_iri: str
        database_iri: str
        item_iri: str
        sessionhour_id: str
        within_iris: list[str]
    """

    count: int
    provider_id: str
    user_id: str
    preprint_id: str
    version: str
    path: str

    class Index:
        # index-level settings applied to indexes for these records
        settings = {
            'number_of_shards': 1,
            'number_of_replicas': 1,
            'refresh_interval': '1s',
        }

    class Meta:
        abstract = True
        source = djelme.MetaField(enabled=True)


class PreprintView(BasePreprintMetrics):
    """Concrete record with exactly the fields of `BasePreprintMetrics`."""


class PreprintDownload(BasePreprintMetrics):
    """Concrete record with exactly the fields of `BasePreprintMetrics`."""


class RegistriesModerationMetrics(djelme.CountedUsageRecord):
    """Record of a registration moderation transition.

    Holds the registration and provider ids, the `trigger`, the states moved
    between (`from_state` -> `to_state`), the acting user, and a comment.
    """

    registration_id: str
    provider_id: str
    trigger: str
    from_state: str
    to_state: str
    user_id: str
    comment: str

    class Index:
        # index-level settings applied to indexes for these records
        settings = {
            'number_of_shards': 1,
            'number_of_replicas': 1,
            'refresh_interval': '1s',
        }

    class Meta:
        source = djelme.MetaField(enabled=True)


###
# Reusable inner objects for reports

class RunningTotal(esdsl.InnerDoc):
    """Inner object pairing a `total` count with a `total_daily` count."""

    total: int
    total_daily: int


class FileRunningTotals(esdsl.InnerDoc):
    """Inner object of file counts (total/public/private), each with a `_daily` twin."""

    # overall counts
    total: int
    public: int
    private: int
    # daily counts
    total_daily: int
    public_daily: int
    private_daily: int


class NodeRunningTotals(esdsl.InnerDoc):
    """Inner object of node counts, each with a `_daily` twin.

    Counts cover: total, total excluding spam, public, private.
    """

    # overall counts
    total: int
    total_excluding_spam: int
    public: int
    private: int
    # daily counts
    total_daily: int
    total_daily_excluding_spam: int
    public_daily: int
    private_daily: int


class RegistrationRunningTotals(esdsl.InnerDoc):
    """Inner object of registration counts, each with a `_daily` twin.

    Counts cover: total, public, embargoed, embargoed_v2, withdrawn.
    """

    # overall counts
    total: int
    public: int
    embargoed: int
    embargoed_v2: int
    withdrawn: int
    # daily counts
    total_daily: int
    public_daily: int
    embargoed_daily: int
    embargoed_v2_daily: int
    withdrawn_daily: int


class UsageByStorageAddon(esdsl.InnerDoc):
    """Inner object of `RunningTotal`s for one addon, identified by `addon_shortname`."""

    addon_shortname: str

    # user-settings totals
    enabled_usersettings: RunningTotal
    linked_usersettings: RunningTotal
    deleted_usersettings: RunningTotal
    usersetting_links: RunningTotal

    # node-settings totals
    connected_nodesettings: RunningTotal
    disconnected_nodesettings: RunningTotal
    deleted_nodesettings: RunningTotal


###
# Cyclic reports


class StorageAddonUsage(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY):
    """Daily-cycle report of storage addon usage."""

    # NOTE(review): annotated as a single `UsageByStorageAddon`; the name
    # suggests one entry per addon (i.e. `list[UsageByStorageAddon]`) -- confirm
    usage_by_addon: UsageByStorageAddon


class DownloadCountReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY):
    """Daily-cycle report holding a single `daily_file_downloads` count."""

    daily_file_downloads: int


class InstitutionSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY):
    """Daily-cycle report of running totals for one institution."""

    # presumably: at most one report per (cycle, institution) -- the exact
    # semantics of UNIQUE_TOGETHER_FIELDS are defined by djelme; confirm there
    UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id')

    institution_id: str
    institution_name: str

    users: RunningTotal
    nodes: NodeRunningTotals
    projects: NodeRunningTotals
    registered_nodes: RegistrationRunningTotals
    registered_projects: RegistrationRunningTotals


class NewUserDomainReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY):
    """Daily-cycle report of new users, one record per `domain_name`."""

    # presumably: at most one report per (cycle, domain) -- the exact
    # semantics of UNIQUE_TOGETHER_FIELDS are defined by djelme; confirm there
    UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'domain_name',)

    domain_name: str
    # the count needs its own name: annotating `domain_name` a second time
    # (as `int`) would replace the `str` declaration above
    new_user_count: int


class NodeSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY):
    """Daily-cycle report of running totals for nodes, projects and registrations."""

    # nodes and projects
    nodes: NodeRunningTotals
    projects: NodeRunningTotals
    # registrations
    registered_nodes: RegistrationRunningTotals
    registered_projects: RegistrationRunningTotals


class OsfstorageFileCountReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY):
    """Daily-cycle report holding `FileRunningTotals` under `files`."""

    files: FileRunningTotals


class PreprintSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY):
    """Daily-cycle report of `preprint_count`, one record per `provider_key`."""

    # presumably: at most one report per (cycle, provider) -- the exact
    # semantics of UNIQUE_TOGETHER_FIELDS are defined by djelme; confirm there
    UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'provider_key')

    provider_key: str
    preprint_count: int


class UserSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY):
    """Daily-cycle report of user counts."""

    active: int
    deactivated: int
    merged: int
    new_users_daily: int
    new_users_with_institution_daily: int
    unconfirmed: int


class SpamSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY):
    """Monthly-cycle report of spam/ham/flagged counts by item type, plus user counts."""

    # nodes
    node_confirmed_spam: int
    node_confirmed_ham: int
    node_flagged: int
    # registrations
    registration_confirmed_spam: int
    registration_confirmed_ham: int
    registration_flagged: int
    # preprints
    preprint_confirmed_spam: int
    preprint_confirmed_ham: int
    preprint_flagged: int
    # users
    user_marked_as_spam: int
    user_marked_as_ham: int


class InstitutionalUserReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY):
    """Monthly-cycle report about one user at one institution."""

    # presumably: at most one report per (cycle, institution, user) -- the exact
    # semantics of UNIQUE_TOGETHER_FIELDS are defined by djelme; confirm there
    UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', 'user_id')

    institution_id: str

    # --- user info ---
    user_id: str
    user_name: str
    department_name: str
    # the next three are `YearmonthField`s, i.e. kept at month precision
    # (including `account_creation_date`, despite the `_date` name)
    month_last_login = YearmonthField()
    month_last_active = YearmonthField()
    account_creation_date = YearmonthField()
    orcid_id: str

    # --- counts ---
    public_project_count: int
    private_project_count: int
    public_registration_count: int
    embargoed_registration_count: int
    published_preprint_count: int
    # explicitly mapped as `Long`
    public_file_count: int = esdsl.mapped_field(esdsl.Long())
    storage_byte_count: int = esdsl.mapped_field(esdsl.Long())


class InstitutionMonthlySummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY):
    """Monthly-cycle report of summary counts for one institution."""

    # presumably: at most one report per (cycle, institution) -- the exact
    # semantics of UNIQUE_TOGETHER_FIELDS are defined by djelme; confirm there
    UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id')

    institution_id: str

    user_count: int
    public_project_count: int
    private_project_count: int
    public_registration_count: int
    embargoed_registration_count: int
    published_preprint_count: int

    # explicitly mapped as `Long`
    storage_byte_count: int = esdsl.mapped_field(esdsl.Long())
    public_file_count: int = esdsl.mapped_field(esdsl.Long())
    monthly_logged_in_user_count: int = esdsl.mapped_field(esdsl.Long())
    monthly_active_user_count: int = esdsl.mapped_field(esdsl.Long())


class PublicItemUsageReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY):
    """Monthly-cycle report of view and download counts for one item (`item_osfid`)."""

    # presumably: at most one report per (cycle, item) -- the exact
    # semantics of UNIQUE_TOGETHER_FIELDS are defined by djelme; confirm there
    UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'item_osfid')

    # fields marked `counter:` are meant to correspond to defined terms from COUNTER
    # https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html
    # https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html
    item_osfid: str
    item_type: list[str]  # counter:Data-Type
    provider_id: list[str]  # counter:Database(?)
    platform_iri: list[str]  # counter:Platform

    # views: these include views on components or files contained by this item
    view_count: int = esdsl.mapped_field(esdsl.Long())
    view_session_count: int = esdsl.mapped_field(esdsl.Long())
    cumulative_view_count: int = esdsl.mapped_field(esdsl.Long())
    cumulative_view_session_count: int = esdsl.mapped_field(esdsl.Long())

    # downloads: this item only (contained components or files are not included)
    download_count: int = esdsl.mapped_field(esdsl.Long())
    download_session_count: int = esdsl.mapped_field(esdsl.Long())
    cumulative_download_count: int = esdsl.mapped_field(esdsl.Long())
    cumulative_download_session_count: int = esdsl.mapped_field(esdsl.Long())


class PrivateSpamMetricsReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY):
    """Monthly-cycle report of flagged/hammed counts per item type and spam service."""

    # nodes
    node_oopspam_flagged: int
    node_oopspam_hammed: int
    node_akismet_flagged: int
    node_akismet_hammed: int
    # preprints
    preprint_oopspam_flagged: int
    preprint_oopspam_hammed: int
    preprint_akismet_flagged: int
    preprint_akismet_hammed: int
Loading