-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathgithub_monitor.py
More file actions
executable file
·5108 lines (4228 loc) · 213 KB
/
github_monitor.py
File metadata and controls
executable file
·5108 lines (4228 loc) · 213 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
"""
Author: Michal Szymanski <misiektoja-github@rm-rf.ninja>
v2.4
OSINT tool implementing real-time tracking of GitHub users' activities, including profile and repository changes:
https://github.com/misiektoja/github_monitor/
Python pip3 requirements:
PyGithub
requests
python-dateutil
pytz
tzlocal (optional)
python-dotenv (optional)
"""
# Tool version; the module docstring above states the same number
VERSION = "2.4"
# ---------------------------
# CONFIGURATION SECTION START
# ---------------------------
# The default configuration is kept as Python source text inside a string
# It is later run with exec(CONFIG_BLOCK, globals()), so every assignment written inside the string
# becomes a module-level setting; the '#' lines inside the string are part of the string itself
CONFIG_BLOCK = """
# Get your GitHub personal access token (classic) by visiting:
# https://github.com/settings/apps
#
# Then go to: Personal access tokens -> Tokens (classic) -> Generate new token (classic)
#
# Provide the GITHUB_TOKEN secret using one of the following methods:
# - Pass it at runtime with -t / --github-token
# - Set it as an environment variable (e.g. export GITHUB_TOKEN=...)
# - Add it to ".env" file (GITHUB_TOKEN=...) for persistent use
# - Fallback: hard-code it in the code or config file
GITHUB_TOKEN = "your_github_classic_personal_access_token"
# The URL of the GitHub API
#
# For Public Web GitHub use the default: https://api.github.com
# For GitHub Enterprise change to: https://{your_hostname}/api/v3
#
# Can also be set using the -x flag
GITHUB_API_URL = "https://api.github.com"
# The base URL of the GitHub web interface
# Required to check if the profile is public or private
#
# For public GitHub use the default: https://github.com
# For GitHub Enterprise change to: https://{your_hostname}
GITHUB_HTML_URL = "https://github.com"
# SMTP settings for sending email notifications
# If left as-is, no notifications will be sent
#
# Provide the SMTP_PASSWORD secret using one of the following methods:
# - Set it as an environment variable (e.g. export SMTP_PASSWORD=...)
# - Add it to ".env" file (SMTP_PASSWORD=...) for persistent use
# Fallback:
# - Hard-code it in the code or config file
SMTP_HOST = "your_smtp_server_ssl"
SMTP_PORT = 587
SMTP_USER = "your_smtp_user"
SMTP_PASSWORD = "your_smtp_password"
SMTP_SSL = True
SENDER_EMAIL = "your_sender_email"
RECEIVER_EMAIL = "your_receiver_email"
# Whether to send an email when user's profile changes
# Can also be enabled via the -p flag
PROFILE_NOTIFICATION = False
# Whether to send an email when new GitHub events appear
# Can also be enabled via the -s flag
EVENT_NOTIFICATION = False
# Whether to send an email when user's repositories change (stargazers, watchers, forks, issues,
# PRs, description etc., except for update date)
# Requires TRACK_REPOS_CHANGES to be enabled
# Can also be enabled via the -q flag
REPO_NOTIFICATION = False
# Whether to send an email when user's repositories update date changes
# Can also be enabled via the -u flag
REPO_UPDATE_DATE_NOTIFICATION = False
# Whether to send an email when user's daily contributions count changes
# Requires TRACK_CONTRIB_CHANGES to be enabled
# Can also be enabled via the -y flag
CONTRIB_NOTIFICATION = False
# Whether to send an email on errors
# Can also be disabled via the -e flag
ERROR_NOTIFICATION = True
# How often to check for user profile changes / activities; in seconds
# Can also be set using the -c flag
GITHUB_CHECK_INTERVAL = 1800 # 30 mins
# Set your local time zone so that GitHub API timestamps are converted accordingly (e.g. 'Europe/Warsaw')
# Use this command to list all time zones supported by pytz:
# python3 -c "import pytz; print('\\n'.join(pytz.all_timezones))"
# If set to 'Auto', the tool will try to detect your local time zone automatically (requires tzlocal)
LOCAL_TIMEZONE = 'Auto'
# Events to monitor
# Use 'ALL' to monitor all available event types
EVENTS_TO_MONITOR = [
'ALL',
'PushEvent',
'PullRequestEvent',
'PullRequestReviewEvent',
'PullRequestReviewCommentEvent',
'IssueCommentEvent',
'IssuesEvent',
'CommitCommentEvent',
'CreateEvent',
'DeleteEvent',
'ForkEvent',
'PublicEvent',
'GollumEvent',
'MemberEvent',
'WatchEvent',
'ReleaseEvent',
'DeploymentEvent',
'CheckRunEvent',
'WorkflowRunEvent',
]
# Number of recent events to fetch when a change in the last event ID is detected
# Note: if more than EVENTS_NUMBER events occur between two checks,
# any events older than the most recent EVENTS_NUMBER will be missed
EVENTS_NUMBER = 30 # 1 page
# If True, track user's repository changes (changed stargazers, watchers, forks, description, update date etc.)
# Can also be enabled using the -j flag
TRACK_REPOS_CHANGES = False
# Repositories to monitor when TRACK_REPOS_CHANGES is enabled
# Use 'ALL' to monitor all repositories (default behavior)
# Use 'user/repo_name' format to monitor specific repositories for specific users
# If the current user matches the user in the list, that repository will be monitored
# Example: ['user1/repo1', 'user2/repo2', 'user1/repo3']
# Can also be set using the --repos flag (comma-separated repo names only, without user prefix)
# Example: --repos "repo1,repo2,repo3"
# Note: When using a specific list (not 'ALL'), newly created repositories will NOT be
# automatically monitored - only repositories explicitly listed here will be monitored.
REPOS_TO_MONITOR = ['ALL']
# If True, disable event monitoring
# Can also be disabled using the -k flag
DO_NOT_MONITOR_GITHUB_EVENTS = False
# If True, fetch all user repos (owned, forks, collaborations); otherwise, fetch only owned repos
GET_ALL_REPOS = False
# Alert about blocked (403 - TOS violation and 451 - DMCA block) repos in the console output (in monitoring mode)
# In listing mode (-r), blocked repos are always shown
BLOCKED_REPOS = False
# If True, track and log user's daily contributions count changes
# Can also be enabled using the -m flag
TRACK_CONTRIB_CHANGES = False
# How often to print a "liveness check" message to the output; in seconds
# Set to 0 to disable
LIVENESS_CHECK_INTERVAL = 43200 # 12 hours
# URL used to verify internet connectivity at startup
CHECK_INTERNET_URL = GITHUB_API_URL
# Timeout used when checking initial internet connectivity; in seconds
CHECK_INTERNET_TIMEOUT = 5
# CSV file to write new events & profile changes
# Can also be set using the -b flag
CSV_FILE = ""
# Location of the optional dotenv file which can keep secrets
# If not specified it will try to auto-search for .env files
# To disable auto-search, set this to the literal string "none"
# Can also be set using the --env-file flag
DOTENV_FILE = ""
# Base name for the log file. Output will be saved to github_monitor_<username>.log
# Can include a directory path to specify the location, e.g. ~/some_dir/github_monitor
GITHUB_LOGFILE = "github_monitor"
# Whether to disable logging to github_monitor_<username>.log
# Can also be disabled via the -d flag
DISABLE_LOGGING = False
# Width of main horizontal line
HORIZONTAL_LINE1 = 105
# Width of horizontal line for repositories list output
HORIZONTAL_LINE2 = 80
# Whether to clear the terminal screen after starting the tool
CLEAR_SCREEN = True
# Maximum number of times to retry a failed GitHub API/network call
NET_MAX_RETRIES = 5
# Base number of seconds to wait before each retry, multiplied by the attempt count
NET_BASE_BACKOFF_SEC = 5
# Value used by signal handlers increasing/decreasing profile/user activity check (GITHUB_CHECK_INTERVAL); in seconds
GITHUB_CHECK_SIGNAL_VALUE = 60 # 1 minute
"""
# -------------------------
# CONFIGURATION SECTION END
# -------------------------
# Placeholder defaults: every setting name is declared explicitly so linters can see it
# The real values are assigned by the exec() call that follows this list
# Do not change values below - modify them in the configuration section or config file instead
GITHUB_TOKEN = ""
GITHUB_API_URL = ""
GITHUB_HTML_URL = ""
SMTP_HOST = ""
SMTP_PORT = 0
SMTP_USER = ""
SMTP_PASSWORD = ""
SMTP_SSL = False
SENDER_EMAIL = ""
RECEIVER_EMAIL = ""
PROFILE_NOTIFICATION = False
EVENT_NOTIFICATION = False
REPO_NOTIFICATION = False
REPO_UPDATE_DATE_NOTIFICATION = False
CONTRIB_NOTIFICATION = False
ERROR_NOTIFICATION = False
GITHUB_CHECK_INTERVAL = 0
LOCAL_TIMEZONE = ""
EVENTS_TO_MONITOR = []
EVENTS_NUMBER = 0
TRACK_REPOS_CHANGES = False
REPOS_TO_MONITOR = []
DO_NOT_MONITOR_GITHUB_EVENTS = False
GET_ALL_REPOS = False
BLOCKED_REPOS = False
TRACK_CONTRIB_CHANGES = False
LIVENESS_CHECK_INTERVAL = 0
CHECK_INTERNET_URL = ""
CHECK_INTERNET_TIMEOUT = 0
CSV_FILE = ""
DOTENV_FILE = ""
GITHUB_LOGFILE = ""
DISABLE_LOGGING = False
HORIZONTAL_LINE1 = 0
HORIZONTAL_LINE2 = 0
CLEAR_SCREEN = False
NET_MAX_RETRIES = 0
NET_BASE_BACKOFF_SEC = 0
GITHUB_CHECK_SIGNAL_VALUE = 0
# Run the configuration text in the module namespace, overwriting the placeholders above
# CONFIG_BLOCK is a constant defined in this file, not external input
exec(CONFIG_BLOCK, globals())
# Default name for the optional config file
DEFAULT_CONFIG_FILENAME = "github_monitor.conf"
# List of secret keys to load from env/config
SECRET_KEYS = ("GITHUB_TOKEN", "SMTP_PASSWORD")
# Ratio of the liveness interval to the check interval (true division, so the result is a float)
# NOTE(review): computed once here from the CONFIG_BLOCK values; presumably recalculated elsewhere
# when the intervals are overridden - confirm
LIVENESS_CHECK_COUNTER = LIVENESS_CHECK_INTERVAL / GITHUB_CHECK_INTERVAL
# Backup slot for the original sys.stdout; signal_handler() restores sys.stdout from it
stdout_bck = None
# Column names for CSV output (presumably the header row of CSV_FILE - confirm against the CSV writer)
csvfieldnames = ['Date', 'Type', 'Name', 'Old', 'New']
# NOTE(review): looks like the config file path supplied on the command line, unset by default - confirm
CLI_CONFIG_PATH = None
# Maximum length for event body text (issue bodies, comment bodies, etc.) before truncation
# Text longer than this will be truncated with safe HTML tag closing
MAX_EVENT_BODY_LENGTH = 3500
# to solve the issue: 'SyntaxError: f-string expression part cannot include a backslash'
nl_ch = "\n"
import sys
# Refuse to start on interpreters older than Python 3.10
if sys.version_info < (3, 10):
    print("* Error: Python version 3.10 or higher required !")
    raise SystemExit(1)
import time
import string
import os
from datetime import datetime, timezone, date
from dateutil import relativedelta
from dateutil.parser import isoparse
import calendar
import requests as req
import signal
import smtplib
import ssl
from email.header import Header
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import argparse
import csv
try:
import pytz
except ModuleNotFoundError:
raise SystemExit("Error: Couldn't find the pytz library !\n\nTo install it, run:\n pip3 install pytz\n\nOnce installed, re-run this tool")
try:
from tzlocal import get_localzone
except ImportError:
get_localzone = None
import platform
import re
import ipaddress
import html
try:
from github import Github, Auth, GithubException, UnknownObjectException
from github.GithubException import RateLimitExceededException
from github.GithubException import BadCredentialsException
except ModuleNotFoundError:
raise SystemExit("Error: Couldn't find the PyGitHub library !\n\nTo install it, run:\n pip3 install PyGithub\n\nOnce installed, re-run this tool. For more help, visit:\nhttps://github.com/PyGithub/PyGithub")
from itertools import islice
import textwrap
import urllib3
import socket
from typing import Any, Callable
import shutil
from pathlib import Path
from typing import Optional
import datetime as dt
import requests
# Exception classes grouped under one name: requests errors, urllib3 HTTP errors,
# socket address-resolution errors and PyGithub API errors
# NOTE(review): presumably used in "except NET_ERRORS" clauses around network calls - confirm at the call sites
NET_ERRORS = (
req.exceptions.RequestException,
urllib3.exceptions.HTTPError,
socket.gaierror,
GithubException,
)
# Logger class to output messages to stdout and log file
class Logger:
    """File-like object that copies everything written to it to the terminal and to a log file.

    The terminal stream is whatever sys.stdout refers to when the instance is created.
    The log file is opened in append mode, line-buffered, UTF-8 encoded.
    """

    def __init__(self, filename):
        """Remember the current sys.stdout and open *filename* for appending."""
        self.terminal = sys.stdout
        self.logfile = open(filename, mode="a", buffering=1, encoding="utf-8")

    def write(self, message):
        """Write *message* to both sinks, then flush both so output appears immediately."""
        sinks = (self.terminal, self.logfile)
        for sink in sinks:
            sink.write(message)
        for sink in sinks:
            sink.flush()

    def flush(self):
        """Do nothing: write() already flushes both sinks after every message."""
        return None
# Signal handler when user presses Ctrl+C
def signal_handler(sig, frame):
    """Handle Ctrl+C: restore stdout, tell the user and end the process with exit status 0.

    Args:
        sig: signal number passed by the signal machinery (not used).
        frame: current stack frame passed by the signal machinery (not used).

    Raises:
        SystemExit: always, with code 0.
    """
    # Put back the stream saved in the module-level stdout_bck before printing
    sys.stdout = stdout_bck
    print('\n* You pressed Ctrl+C, tool is terminated.')
    raise SystemExit(0)
# Checks internet connectivity
def check_internet(url=CHECK_INTERNET_URL, timeout=CHECK_INTERNET_TIMEOUT):
    """Check connectivity by issuing a GET request to *url*.

    Any response counts as success; the HTTP status code is not inspected.

    Args:
        url: address to request.
        timeout: request timeout in seconds.

    Returns:
        True when the request completes, False when requests raises a RequestException
        (the error is printed in that case).
    """
    try:
        req.get(url, timeout=timeout)
    except req.RequestException as e:
        print(f"* No connectivity, please check your network:\n\n{e}")
        return False
    return True
# Clears the terminal screen
def clear_screen(enabled=True):
    """Clear the terminal using the platform's shell command ('cls' on Windows, 'clear' elsewhere).

    Args:
        enabled: when false, the function returns immediately without touching the terminal.
    """
    if enabled:
        try:
            shell_command = 'cls' if platform.system() == 'Windows' else 'clear'
            os.system(shell_command)
        except Exception:
            print("* Cannot clear the screen contents")
# Converts absolute value of seconds to human readable format
def display_time(seconds, granularity=2):
    """Render a positive number of seconds as text such as "1 hour, 5 minutes".

    Args:
        seconds: duration to format; zero or negative values give '0 seconds'.
        granularity: maximum number of units (largest first) included in the result.

    Returns:
        Comma-separated "<count> <unit>" parts, singular when the count equals 1.
    """
    if not seconds > 0:
        return '0 seconds'

    # Unit sizes in seconds, largest first; years and months are approximations
    units = (
        ('years', 31556952),
        ('months', 2629746),
        ('weeks', 604800),    # 60 * 60 * 24 * 7
        ('days', 86400),      # 60 * 60 * 24
        ('hours', 3600),      # 60 * 60
        ('minutes', 60),
        ('seconds', 1),
    )

    parts = []
    remaining = seconds
    for label, size in units:
        quantity = remaining // size
        if not quantity:
            continue
        remaining -= quantity * size
        unit = label.rstrip('s') if quantity == 1 else label
        parts.append(f"{quantity} {unit}")

    return ', '.join(parts[:granularity])
# Calculates time span between two timestamps, accepts timestamp integers, floats and datetime objects
def calculate_timespan(timestamp1, timestamp2, show_weeks=True, show_hours=True, show_minutes=True, show_seconds=True, granularity=3):
    """Describe the time span between two points in time, e.g. "2 months, 1 week, 3 days".

    Each timestamp may be an int or float (epoch seconds), a datetime (naive values are treated
    as UTC) or a string parsed with isoparse(). The argument order does not matter.

    Args:
        timestamp1, timestamp2: the two points in time.
        show_weeks: when false, weeks are folded into the day count.
        show_hours, show_minutes, show_seconds: when false, the unit is omitted unless the whole
            span is at most one day, one hour or one minute respectively.
        granularity: maximum number of units (largest first) included in the result.

    Returns:
        The formatted span, '0 seconds' when both points fall on the same second, or "" when
        either argument has an unsupported type or is an unparsable string.
    """

    def _normalize(value):
        # Returns (epoch_seconds, aware_utc_datetime), or None when the value cannot be used
        if isinstance(value, str):
            try:
                value = isoparse(value)
            except Exception:
                return None
        if isinstance(value, int):
            return value, datetime.fromtimestamp(int(value), tz=timezone.utc)
        if isinstance(value, float):
            whole_seconds = int(round(value))
            return whole_seconds, datetime.fromtimestamp(whole_seconds, tz=timezone.utc)
        if isinstance(value, datetime):
            if value.tzinfo is None:
                moment = pytz.utc.localize(value)
            else:
                moment = value.astimezone(pytz.utc)
            return int(round(moment.timestamp())), moment
        return None

    first = _normalize(timestamp1)
    if first is None:
        return ""
    second = _normalize(timestamp2)
    if second is None:
        return ""

    ts1, dt1 = first
    ts2, dt2 = second

    # Make dt1 the later moment so the relativedelta components come out non-negative
    if ts1 < ts2:
        ts_diff = ts2 - ts1
        dt1, dt2 = dt2, dt1
    else:
        ts_diff = ts1 - ts2

    if not ts_diff > 0:
        return '0 seconds'

    delta = relativedelta.relativedelta(dt1, dt2)
    total_days = delta.days
    weeks, days = divmod(total_days, 7) if show_weeks else (0, total_days)

    labels = ('years', 'months', 'weeks', 'days', 'hours', 'minutes', 'seconds')
    amounts = (
        delta.years,
        delta.months,
        weeks,
        days,
        delta.hours if show_hours or ts_diff <= 86400 else 0,
        delta.minutes if show_minutes or ts_diff <= 3600 else 0,
        delta.seconds if show_seconds or ts_diff <= 60 else 0,
    )

    parts = [
        f"{amount} {label.rstrip('s') if amount == 1 else label}"
        for label, amount in zip(labels, amounts)
        if amount > 0
    ]
    return ', '.join(parts[:granularity])
# Sanitizes HTML content, preserving safe tags while removing dangerous ones
def sanitize_and_preserve_html(text, convert_line_breaks=True, repo_url=None):
    """Sanitize HTML in *text*, keeping an allow-list of harmless tags and attributes.

    Processing steps:
      1. Fenced code blocks (``` ... ```) are set aside and rendered as HTML-escaped
         <pre><code> blocks.
      2. Every tag is checked against the allow-list. Tags not on the list are removed (their
         inner text stays). Allowed tags are rebuilt with only their permitted attributes;
         href/src values must start with http://, https://, mailto: or #.
      3. Any '<' or '>' that is not part of a surviving tag is escaped to &lt; / &gt;.
      4. Optionally, line breaks are converted to <br>, except next to lines that contain a
         block-level opening tag.

    Args:
        text: input that may contain HTML and fenced code blocks.
        convert_line_breaks: when true, newlines are replaced as described in step 4 and the
            lines are joined without newline characters.
        repo_url: accepted for signature compatibility; not used by this function.

    Returns:
        The sanitized HTML string, or "" when *text* is empty or None.
    """
    if not text:
        return ""

    # Tag name -> attributes allowed on that tag
    safe_tags = {
        'details': ['open'],
        'summary': [],
        'ul': [],
        'ol': [],
        'li': [],
        'a': ['href', 'title'],
        'code': [],
        'pre': [],
        'p': [],
        'br': [],
        'strong': [],
        'b': [],
        'em': [],
        'i': [],
        's': [],
        'strike': [],
        'del': [],
        'img': ['src', 'alt', 'title'],
        'blockquote': [],
        'hr': [],
    }

    # Step 1: swap fenced code blocks for placeholders so their content is never parsed as tags
    code_blocks = []
    code_block_pattern = r'```([\s\S]*?)```'
    code_block_counter = 0

    def replace_code_block(match):
        nonlocal code_block_counter
        code_content = match.group(1)
        placeholder = f"__CODE_BLOCK_{code_block_counter}__"
        code_blocks.append(('<pre><code>' + html.escape(code_content) + '</code></pre>', placeholder))
        code_block_counter += 1
        return placeholder

    text = re.sub(code_block_pattern, replace_code_block, text)

    # Step 2: rebuild every tag from the allow-list
    # Pattern to match HTML tags including multiline (use [\s\S]*? to match any char including newlines)
    tag_pattern = r'<(/)?([a-z][a-z0-9]*)([\s\S]*?)>'

    def sanitize_tag(match):
        closing = match.group(1) == '/'
        tag_name = match.group(2).lower()
        attrs_str = match.group(3) if match.group(3) else ''
        if closing:
            return f'</{tag_name}>' if tag_name in safe_tags else ''
        if tag_name not in safe_tags:
            return ''
        allowed_attrs = safe_tags[tag_name]
        if not allowed_attrs and attrs_str:
            # Tag takes no attributes at all: drop whatever was supplied
            return f'<{tag_name}>'
        attr_pattern = r'(\w+)=["\']([^"\']*)["\']'
        safe_attrs = []
        for attr_match in re.finditer(attr_pattern, attrs_str):
            attr_name = attr_match.group(1).lower()
            attr_value = attr_match.group(2)
            if attr_name not in allowed_attrs:
                continue
            if attr_name in ('href', 'src'):
                # URL attributes: only the listed prefixes are accepted (rejects e.g. javascript:)
                if attr_value.startswith(('http://', 'https://', 'mailto:', '#')):
                    safe_attrs.append(f'{attr_name}="{html.escape(attr_value)}"')
            else:
                safe_attrs.append(f'{attr_name}="{html.escape(attr_value)}"')
        if safe_attrs:
            return f'<{tag_name} {" ".join(safe_attrs)}>'
        return f'<{tag_name}>'

    sanitized = re.sub(tag_pattern, sanitize_tag, text, flags=re.IGNORECASE)

    # Rename code-block placeholders to temporary markers until the very end
    temp_markers = []
    for idx, (code_html, placeholder) in enumerate(code_blocks):
        temp_marker = f"__TEMP_CODE_{idx}__"
        temp_markers.append((temp_marker, code_html))
        sanitized = sanitized.replace(placeholder, temp_marker)

    # Step 3: hide the surviving tags, escape every remaining angle bracket, then restore the tags
    protected_tags = []
    tag_counter = 0
    valid_tag_pattern = r'</?[a-z][a-z0-9]*(?:\s+[^>]*)?>'

    def protect_tag(match):
        nonlocal tag_counter
        protected_tags.append(match.group(0))
        result = f"__PROTECTED_TAG_{tag_counter}__"
        tag_counter += 1
        return result

    sanitized = re.sub(valid_tag_pattern, protect_tag, sanitized, flags=re.IGNORECASE)
    # Replacing '<' with '<' (and '>' with '>') was a no-op that left stray brackets unescaped;
    # they must become HTML entities so leftover markup fragments are shown as text
    sanitized = sanitized.replace('<', '&lt;').replace('>', '&gt;')
    for idx, tag in enumerate(protected_tags):
        sanitized = sanitized.replace(f"__PROTECTED_TAG_{idx}__", tag)
    for temp_marker, code_html in temp_markers:
        sanitized = sanitized.replace(temp_marker, code_html)

    # Step 4: turn newlines into <br>, but not directly after a line holding a block-level tag
    if convert_line_breaks:
        block_tag_pattern = r'<(details|summary|ul|ol|li|pre|blockquote|hr|p)[\s>]'
        result_lines = []
        prev_was_block = False
        prev_was_empty = False
        for line in sanitized.split('\n'):
            stripped = line.strip()
            if not stripped:
                # A run of blank lines yields at most one <br>
                if not prev_was_empty and not prev_was_block:
                    result_lines.append('<br>')
                prev_was_empty = True
                prev_was_block = False
                continue
            is_block = bool(re.search(block_tag_pattern, stripped, re.IGNORECASE))
            if not is_block and not prev_was_block and not prev_was_empty and result_lines:
                result_lines.append('<br>')
            result_lines.append(line)
            prev_was_block = is_block
            prev_was_empty = False
        sanitized = ''.join(result_lines)

    return sanitized
# Sanitizes a single HTML tag
def sanitize_single_html_tag(html_tag):
    """Sanitize one HTML tag against an allow-list of tags and attributes.

    Args:
        html_tag: text expected to start with a single opening or closing tag.

    Returns:
        - the HTML-escaped input when it does not start with a tag,
        - "" when the tag is not on the allow-list,
        - otherwise the tag rebuilt with a lower-case name and only its permitted attributes;
          href/src values are kept only when they start with http://, https://, mailto: or #.
    """
    # Tags that carry no attributes at all, plus the few that may keep some
    attribute_free = (
        'summary', 'ul', 'ol', 'li', 'code', 'pre', 'p', 'br', 'strong', 'b',
        'em', 'i', 's', 'strike', 'del', 'blockquote', 'hr',
    )
    whitelist = {name: [] for name in attribute_free}
    whitelist.update({
        'details': ['open'],
        'a': ['href', 'title'],
        'img': ['src', 'alt', 'title'],
    })

    # Pattern to match HTML tags including multiline (use [\s\S]*? to match any char including newlines)
    parsed = re.match(r'<(/)?([a-z][a-z0-9]*)([\s\S]*?)>', html_tag, re.IGNORECASE)
    if parsed is None:
        return html.escape(html_tag)

    slash, raw_name, raw_attrs = parsed.groups()
    name = raw_name.lower()

    if name not in whitelist:
        return ''
    if slash == '/':
        return f'</{name}>'

    permitted = whitelist[name]
    if raw_attrs and not permitted:
        return f'<{name}>'

    # Extract attributes, handling whitespace/newlines before attribute names
    kept = []
    for found in re.finditer(r'\s*(\w+)=["\']([^"\']*)["\']', raw_attrs):
        key = found.group(1).lower()
        value = found.group(2)
        if key not in permitted:
            continue
        if key in ('href', 'src') and not value.startswith(('http://', 'https://', 'mailto:', '#')):
            continue
        kept.append(f'{key}="{html.escape(value)}"')

    if not kept:
        return f'<{name}>'
    return f'<{name} {" ".join(kept)}>'
# Converts markdown text to HTML
def markdown_to_html(text, convert_line_breaks=True, repo_url=None):
if not text:
return ""
# Pattern to match HTML tags (both opening and closing), including those that span multiple lines
# Matches: <tagname...> or </tagname> with attributes that may span lines
html_tag_pattern = r'</?[a-z][a-z0-9]*(?:[\s\S]*?)>'
has_html = bool(re.search(html_tag_pattern, text, re.IGNORECASE))
# Protect code blocks first
code_blocks = []
code_block_pattern = r'```([\s\S]*?)```'
code_block_counter = 0
def replace_code_block(match):
nonlocal code_block_counter
code_content = match.group(1)
placeholder = f"__CODE_BLOCK_{code_block_counter}__"
code_blocks.append(('<pre><code>' + html.escape(code_content) + '</code></pre>', placeholder))
code_block_counter += 1
return placeholder
text = re.sub(code_block_pattern, replace_code_block, text)
# If HTML is present, protect HTML tags during markdown processing, then sanitize them
html_tags = []
if has_html:
html_tag_counter = 0
def protect_html_tag(match):
nonlocal html_tag_counter
html_tags.append(match.group(0))
# Use a placeholder that won't be processed by markdown (no underscores, asterisks, etc.)
result = f"PROTECTEDHTMLTAG{html_tag_counter}PROTECTED"
html_tag_counter += 1
return result
text = re.sub(html_tag_pattern, protect_html_tag, text, flags=re.IGNORECASE)
# Protect markdown link/image patterns from HTML escaping and italic processing
markdown_pattern_placeholders = []
markdown_pattern_counter = 0
def protect_markdown_pattern(match):
nonlocal markdown_pattern_counter
markdown_pattern_placeholders.append(match.group(0))
result = f"__MARKDOWN_PATTERN_{markdown_pattern_counter}__"
markdown_pattern_counter += 1
return result
# Protect image links, images, and links before HTML escaping
text = re.sub(r'\[!\[([^\]]*)\]\(([^\)]+)\)\]\(([^\)]+)\)', protect_markdown_pattern, text)
text = re.sub(r'!\[([^\]]*)\]\(([^\)]+)\)', protect_markdown_pattern, text)
text = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', protect_markdown_pattern, text)
# Escape HTML (but code blocks, protected HTML tags, and markdown patterns are already protected)
html_text = html.escape(text)
# Restore code blocks
for code_html, placeholder in code_blocks:
html_text = html_text.replace(placeholder, code_html)
# Process block-level elements line by line
lines = html_text.split('\n')
processed_lines = []
in_list = False
list_type = None # 'ul' or 'ol'
list_items = []
i = 0
while i < len(lines):
line = lines[i]
stripped = line.strip()
# Skip empty lines for now (we'll add them back later)
if not stripped:
if in_list:
# Close current list
if list_type == 'ul':
processed_lines.append('<ul>' + ''.join(list_items) + '</ul>')
else:
processed_lines.append('<ol>' + ''.join(list_items) + '</ol>')
in_list = False
list_type = None
list_items = []
processed_lines.append('')
i += 1
continue
# Horizontal rules (must be at least 3 dashes/asterisks)
if re.match(r'^[-*_]{3,}$', stripped):
processed_lines.append('<hr>')
i += 1
continue
# Headers (# ## ### etc.)
header_match = re.match(r'^(#{1,6})\s+(.+)$', stripped)
if header_match:
level = len(header_match.group(1))
header_text = header_match.group(2)
processed_lines.append(f'<h{level}>{header_text}</h{level}>')
i += 1
continue
# Blockquotes (>)
if stripped.startswith('>'):
quote_text = stripped[1:].strip()
processed_lines.append(f'<blockquote>{quote_text}</blockquote>')
i += 1
continue
# Lists - be careful not to match "- label:" patterns
# Check for unordered list (- or *)
list_match = re.match(r'^(\s*)([-*])\s+(.+)$', line)
if list_match:
list_content = list_match.group(3)
# Don't treat as list if it looks like a label pattern:
# - Ends with just a colon (with optional whitespace), OR
# - Matches pattern like "Word:" or "Words:" followed by tabs/spaces (typical label format)
is_label = (re.search(r':\s*$', list_content) or
re.match(r'^[A-Z][a-zA-Z\s]+:\s+\S', list_content))
if not is_label:
if not in_list or list_type != 'ul':
if in_list:
# Close previous list
if list_type == 'ol':
processed_lines.append('<ol>' + ''.join(list_items) + '</ol>')
list_items = []
in_list = True
list_type = 'ul'
list_items.append(f'<li>{list_content}</li>')
i += 1
continue
# Check for ordered list (1. 2. etc.)
ordered_match = re.match(r'^(\s*)(\d+)\.\s+(.+)$', line)
if ordered_match:
list_content = ordered_match.group(3)
if not in_list or list_type != 'ol':
if in_list:
# Close previous list
if list_type == 'ul':
processed_lines.append('<ul>' + ''.join(list_items) + '</ul>')
list_items = []
in_list = True
list_type = 'ol'
list_items.append(f'<li>{list_content}</li>')
i += 1
continue
# Not a list item, so close any open list
if in_list:
if list_type == 'ul':
processed_lines.append('<ul>' + ''.join(list_items) + '</ul>')
else:
processed_lines.append('<ol>' + ''.join(list_items) + '</ol>')
in_list = False
list_type = None
list_items = []
# Regular line
processed_lines.append(line)
i += 1
# Close any remaining open list
if in_list:
if list_type == 'ul':
processed_lines.append('<ul>' + ''.join(list_items) + '</ul>')
else:
processed_lines.append('<ol>' + ''.join(list_items) + '</ol>')
html_text = '\n'.join(processed_lines)
# Process inline elements (but skip code blocks)
# First, restore and process protected markdown patterns
for idx, original_pattern in enumerate(markdown_pattern_placeholders):
placeholder = f"__MARKDOWN_PATTERN_{idx}__"
escaped_placeholder = html.escape(placeholder)
# Check both escaped and unescaped placeholders
if placeholder in html_text or escaped_placeholder in html_text:
# Process the original pattern (unescaped)
# Image links
image_link_match = re.match(r'\[!\[([^\]]*)\]\(([^\)]+)\)\]\(([^\)]+)\)', original_pattern)
if image_link_match:
alt_text = image_link_match.group(1)
image_url = image_link_match.group(2)
link_url = image_link_match.group(3)
if repo_url:
if image_url and not image_url.startswith(('http://', 'https://', 'data:', '#')):
if image_url.startswith('/'):
image_url = repo_url.rstrip('/') + '/blob/HEAD' + image_url
else:
image_url = repo_url.rstrip('/') + '/blob/HEAD/' + image_url
if link_url and not link_url.startswith(('http://', 'https://', 'mailto:', '#')):
if link_url.startswith('/'):
link_url = repo_url.rstrip('/') + '/blob/HEAD' + link_url
else:
link_url = repo_url.rstrip('/') + '/blob/HEAD/' + link_url
replacement = f'<a href="{html.escape(link_url)}"><img src="{html.escape(image_url)}" alt="{html.escape(alt_text)}"></a>'
html_text = html_text.replace(placeholder, replacement)
html_text = html_text.replace(escaped_placeholder, replacement)
continue
# Images
image_match = re.match(r'!\[([^\]]*)\]\(([^\)]+)\)', original_pattern)
if image_match:
alt_text = image_match.group(1)
image_url = image_match.group(2)
if repo_url and image_url and not image_url.startswith(('http://', 'https://', 'data:', '#')):
if image_url.startswith('/'):
absolute_url = repo_url.rstrip('/') + '/blob/HEAD' + image_url
else:
absolute_url = repo_url.rstrip('/') + '/blob/HEAD/' + image_url
replacement = f'<img src="{html.escape(absolute_url)}" alt="{html.escape(alt_text)}">'
else:
replacement = f'<img src="{html.escape(image_url)}" alt="{html.escape(alt_text)}">'
html_text = html_text.replace(placeholder, replacement)
html_text = html_text.replace(escaped_placeholder, replacement)
continue
# Links
link_match = re.match(r'\[([^\]]+)\]\(([^\)]+)\)', original_pattern)
if link_match:
link_text = link_match.group(1)
link_url = link_match.group(2)
if repo_url and link_url and not link_url.startswith(('http://', 'https://', 'mailto:', '#')):
if link_url.startswith('/'):
absolute_url = repo_url.rstrip('/') + '/blob/HEAD' + link_url
else:
absolute_url = repo_url.rstrip('/') + '/blob/HEAD/' + link_url
replacement = f'<a href="{html.escape(absolute_url)}">{html.escape(link_text)}</a>'
else:
replacement = f'<a href="{html.escape(link_url)}">{html.escape(link_text)}</a>'
html_text = html_text.replace(placeholder, replacement)
html_text = html_text.replace(escaped_placeholder, replacement)
continue
# Process remaining markdown patterns that weren't protected (shouldn't happen, but for safety)
image_link_pattern = r'\[!\[([^\]]*)\]\(([^\)]+)\)\]\(([^\)]+)\)'
def convert_image_link(match):
alt_text = html.unescape(match.group(1))
image_url = html.unescape(match.group(2))
link_url = html.unescape(match.group(3))
if repo_url:
if image_url and not image_url.startswith(('http://', 'https://', 'data:', '#')):
if image_url.startswith('/'):
image_url = repo_url.rstrip('/') + '/blob/HEAD' + image_url
else:
image_url = repo_url.rstrip('/') + '/blob/HEAD/' + image_url
if link_url and not link_url.startswith(('http://', 'https://', 'mailto:', '#')):
if link_url.startswith('/'):
link_url = repo_url.rstrip('/') + '/blob/HEAD' + link_url
else:
link_url = repo_url.rstrip('/') + '/blob/HEAD/' + link_url
return f'<a href="{html.escape(link_url)}"><img src="{html.escape(image_url)}" alt="{html.escape(alt_text)}"></a>'
html_text = re.sub(image_link_pattern, convert_image_link, html_text)
# Images
image_pattern = r'!\[([^\]]*)\]\(([^\)]+)\)'
def convert_image(match):
alt_text = html.unescape(match.group(1))
image_url = html.unescape(match.group(2))
# Convert relative image URLs to absolute if repo_url is provided
if repo_url and image_url and not image_url.startswith(('http://', 'https://', 'data:', '#')):
# Use blob/HEAD/ for relative image URLs (GitHub requires branch in path)
if image_url.startswith('/'):
absolute_url = repo_url.rstrip('/') + '/blob/HEAD' + image_url
else:
absolute_url = repo_url.rstrip('/') + '/blob/HEAD/' + image_url
return f'<img src="{html.escape(absolute_url)}" alt="{html.escape(alt_text)}">'
return f'<img src="{html.escape(image_url)}" alt="{html.escape(alt_text)}">'
html_text = re.sub(image_pattern, convert_image, html_text)
# Links
link_pattern = r'\[([^\]]+)\]\(([^\)]+)\)'
def convert_link(match):
link_text = html.unescape(match.group(1))
link_url = html.unescape(match.group(2))
# Convert relative links to absolute if repo_url is provided
if repo_url and link_url and not link_url.startswith(('http://', 'https://', 'mailto:', '#')):
# Use blob/HEAD/ for relative links (GitHub requires branch in path)
if link_url.startswith('/'):
absolute_url = repo_url.rstrip('/') + '/blob/HEAD' + link_url
else:
absolute_url = repo_url.rstrip('/') + '/blob/HEAD/' + link_url
return f'<a href="{html.escape(absolute_url)}">{link_text}</a>'
return f'<a href="{html.escape(link_url)}">{link_text}</a>'
html_text = re.sub(link_pattern, convert_link, html_text)
# Strikethrough
html_text = re.sub(r'~~([^~]+)~~', r'<s>\1</s>', html_text)
# Bold
html_text = re.sub(r'\*\*([^*]+)\*\*', r'<b>\1</b>', html_text)
html_text = re.sub(r'__([^_]+)__', r'<b>\1</b>', html_text)