1+ {
2+ "cells" : [
3+ {
4+ "cell_type" : " markdown" ,
5+ "metadata" : {
6+ "collapsed" : true ,
7+ "pycharm" : {
8+ "name" : " #%% md\n "
9+ }
10+ },
11+ "source" : [
12+ " # 0. Download Corpora\n " ,
13+ " \n " ,
14+ " *Version: 2022-04-17*\n " ,
15+ " \n " ,
16+ " ---\n " ,
17+ " \n " ,
18+ " **Imports**"
19+ ]
20+ },
21+ {
22+ "cell_type" : " code" ,
23+ "execution_count" : 1 ,
24+ "outputs" : [],
25+ "source" : [
26+ " # standard library\n " ,
27+ " from typing import Set\n " ,
28+ " \n " ,
29+ " # LexNLP\n " ,
30+ " from lexnlp.ml.catalog.download import download_github_release"
31+ ],
32+ "metadata" : {
33+ "collapsed" : false ,
34+ "pycharm" : {
35+ "name" : " #%%\n "
36+ }
37+ }
38+ },
39+ {
40+ "cell_type" : " code" ,
41+ "execution_count" : 2 ,
42+ "outputs" : [],
43+ "source" : [
44+ " TAGS: Set[str] = {\n " ,
45+ " \n " ,
46+ " # labeled contract types\n " ,
47+ " 'corpus/contract-types/0.1',\n " ,
48+ " \n " ,
49+ " # Atticus CUAD v1 contracts\n " ,
50+ " 'corpus/atticus-cuad-v1-plaintext/0.1',\n " ,
51+ " }"
52+ ],
53+ "metadata" : {
54+ "collapsed" : false ,
55+ "pycharm" : {
56+ "name" : " #%%\n "
57+ }
58+ }
59+ },
60+ {
61+ "cell_type" : " markdown" ,
62+ "source" : [
63+ " Note: running the cell below downloads the corpora listed above from GitHub and stores them on disk. Some may be large. Set `prompt_user=True` in the `download_github_release` call in order to validate file sizes."
64+ ],
65+ "metadata" : {
66+ "collapsed" : false ,
67+ "pycharm" : {
68+ "name" : " #%% md\n "
69+ }
70+ }
71+ },
72+ {
73+ "cell_type" : " code" ,
74+ "execution_count" : 3 ,
75+ "outputs" : [
76+ {
77+ "name" : " stdout" ,
78+ "output_type" : " stream" ,
79+ "text" : [
80+ " corpus/sec-edgar-forms-3-4-5-8k-10k-sample/0.1\n "
81+ ]
82+ },
83+ {
84+ "name" : " stderr" ,
85+ "output_type" : " stream" ,
86+ "text" : [
87+ " INFO:root:Downloading sec-edgar-forms-3-4-5-8k-10k-sample.tar.xz...\n " ,
88+ " 8.41MiB [00:00, 9.55MiB/s] \n " ,
89+ " INFO:root:...downloaded sec-edgar-forms-3-4-5-8k-10k-sample.tar.xz to /home/aparsons/lexpredict-contraxsuite-core/lexnlp/ml/catalog/data/corpus/sec-edgar-forms-3-4-5-8k-10k-sample/0.1\n " ,
90+ " INFO:root:Detected MD5; verifying (TBm93p9oJ1gVV5DJROwEOA==)...\n " ,
91+ " INFO:root:...verified.\n "
92+ ]
93+ }
94+ ],
95+ "source" : [
96+ " for tag in TAGS:\n " ,
97+ " print(tag)\n " ,
98+ " download_github_release(tag=tag, prompt_user=False)"
99+ ],
100+ "metadata" : {
101+ "collapsed" : false ,
102+ "pycharm" : {
103+ "name" : " #%%\n "
104+ }
105+ }
106+ },
107+ {
108+ "cell_type" : " code" ,
109+ "execution_count" : null ,
110+ "outputs" : [],
111+ "source" : [],
112+ "metadata" : {
113+ "collapsed" : false ,
114+ "pycharm" : {
115+ "name" : " #%%\n "
116+ }
117+ }
118+ }
119+ ],
120+ "metadata" : {
121+ "kernelspec" : {
122+ "display_name" : " Python 3" ,
123+ "language" : " python" ,
124+ "name" : " python3"
125+ },
126+ "language_info" : {
127+ "codemirror_mode" : {
128+ "name" : " ipython" ,
129+ "version" : 2
130+ },
131+ "file_extension" : " .py" ,
132+ "mimetype" : " text/x-python" ,
133+ "name" : " python" ,
134+ "nbconvert_exporter" : " python" ,
135+ "pygments_lexer" : " ipython2" ,
136+ "version" : " 2.7.6"
137+ }
138+ },
139+ "nbformat" : 4 ,
140+ "nbformat_minor" : 0
141+ }
0 commit comments