Skip to content

Commit 180ab05

Browse files
committed
added notebooks
1 parent d419a4f commit 180ab05

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+13549
-1
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
data/samples/
66
libs/tika/
77
libs/stanford_nlp/
8-
notebooks/
8+
99
bin/
1010

1111
# Byte-compiled / optimized / DLL files
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {
6+
"collapsed": true,
7+
"pycharm": {
8+
"name": "#%% md\n"
9+
}
10+
},
11+
"source": [
12+
"# 0. Download Corpora\n",
13+
"\n",
14+
"*Version: 2022-04-17*\n",
15+
"\n",
16+
"---\n",
17+
"\n",
18+
"**Imports**"
19+
]
20+
},
21+
{
22+
"cell_type": "code",
23+
"execution_count": 1,
24+
"outputs": [],
25+
"source": [
26+
"# standard library\n",
27+
"from typing import Set\n",
28+
"\n",
29+
"# LexNLP\n",
30+
"from lexnlp.ml.catalog.download import download_github_release"
31+
],
32+
"metadata": {
33+
"collapsed": false,
34+
"pycharm": {
35+
"name": "#%%\n"
36+
}
37+
}
38+
},
39+
{
40+
"cell_type": "code",
41+
"execution_count": 2,
42+
"outputs": [],
43+
"source": [
44+
"TAGS: Set[str] = {\n",
45+
"\n",
46+
" # labeled contract types\n",
47+
" 'corpus/contract-types/0.1',\n",
48+
"\n",
49+
" # Atticus CUAD v1 contracts\n",
50+
" 'corpus/atticus-cuad-v1-plaintext/0.1',\n",
51+
"}"
52+
],
53+
"metadata": {
54+
"collapsed": false,
55+
"pycharm": {
56+
"name": "#%%\n"
57+
}
58+
}
59+
},
60+
{
61+
"cell_type": "markdown",
62+
"source": [
63+
"Note: the corpora listed above will be downloaded from GitHub and stored on disk, and some may be large. Set `prompt_user=True` in the download call below in order to validate file sizes."
64+
],
65+
"metadata": {
66+
"collapsed": false,
67+
"pycharm": {
68+
"name": "#%% md\n"
69+
}
70+
}
71+
},
72+
{
73+
"cell_type": "code",
74+
"execution_count": 3,
75+
"outputs": [
76+
{
77+
"name": "stdout",
78+
"output_type": "stream",
79+
"text": [
80+
"corpus/sec-edgar-forms-3-4-5-8k-10k-sample/0.1\n"
81+
]
82+
},
83+
{
84+
"name": "stderr",
85+
"output_type": "stream",
86+
"text": [
87+
"INFO:root:Downloading sec-edgar-forms-3-4-5-8k-10k-sample.tar.xz...\n",
88+
"8.41MiB [00:00, 9.55MiB/s] \n",
89+
"INFO:root:...downloaded sec-edgar-forms-3-4-5-8k-10k-sample.tar.xz to /home/aparsons/lexpredict-contraxsuite-core/lexnlp/ml/catalog/data/corpus/sec-edgar-forms-3-4-5-8k-10k-sample/0.1\n",
90+
"INFO:root:Detected MD5; verifying (TBm93p9oJ1gVV5DJROwEOA==)...\n",
91+
"INFO:root:...verified.\n"
92+
]
93+
}
94+
],
95+
"source": [
96+
"for tag in TAGS:\n",
97+
" print(tag)\n",
98+
" download_github_release(tag=tag, prompt_user=False)"
99+
],
100+
"metadata": {
101+
"collapsed": false,
102+
"pycharm": {
103+
"name": "#%%\n"
104+
}
105+
}
106+
},
107+
{
108+
"cell_type": "code",
109+
"execution_count": null,
110+
"outputs": [],
111+
"source": [],
112+
"metadata": {
113+
"collapsed": false,
114+
"pycharm": {
115+
"name": "#%%\n"
116+
}
117+
}
118+
}
119+
],
120+
"metadata": {
121+
"kernelspec": {
122+
"display_name": "Python 3",
123+
"language": "python",
124+
"name": "python3"
125+
},
126+
"language_info": {
127+
"codemirror_mode": {
128+
"name": "ipython",
129+
"version": 2
130+
},
131+
"file_extension": ".py",
132+
"mimetype": "text/x-python",
133+
"name": "python",
134+
"nbconvert_exporter": "python",
135+
"pygments_lexer": "ipython2",
136+
"version": "2.7.6"
137+
}
138+
},
139+
"nbformat": 4,
140+
"nbformat_minor": 0
141+
}

0 commit comments

Comments
 (0)