File tree Expand file tree Collapse file tree 3 files changed +16
-8
lines changed
main/java/com/github/bottomlessarchive/warc/service
test/java/com/github/bottomlessarchive/warc/test Expand file tree Collapse file tree 3 files changed +16
-8
lines changed Original file line number Diff line number Diff line change @@ -24,6 +24,16 @@ public static <T extends WarcContentBlock> Stream<WarcRecord<T>> streamOf(@NotNu
2424 return WarcRecordStreamFactory .streamOf (url , EVERY_WARC_RECORD_TYPE );
2525 }
2626
27+ public static <T extends WarcContentBlock > Stream <WarcRecord <T >> streamOf (@ NotNull @ NonNull final URL url ,
28+ @ NotNull @ NonNull final WarcRecordType ... requiredRecordTypes ) {
29+ try {
30+ return streamOf (new AvailableInputStream (new BufferedInputStream (url .openStream ())),
31+ WarcReader .DEFAULT_CHARSET , true , List .of (requiredRecordTypes ));
32+ } catch (IOException e ) {
33+ throw new WarcNetworkException ("Unable to open WARC location: " + url + "!" , e );
34+ }
35+ }
36+
2737 public static <T extends WarcContentBlock > Stream <WarcRecord <T >> streamOf (@ NotNull @ NonNull final URL url ,
2838 @ NotNull @ NonNull final List <WarcRecordType > requiredRecordTypes ) {
2939 try {
Original file line number Diff line number Diff line change 66import com .github .bottomlessarchive .warc .service .content .response .domain .ResponseContentBlock ;
77import com .github .bottomlessarchive .warc .service .record .domain .WarcRecord ;
88
9- import java .io .File ;
109import java .io .FileInputStream ;
1110import java .util .Optional ;
1211
1312public class TestFileWarcReader {
1413
1514 public static void main (final String ... arg ) throws Exception {
1615 final WarcReader warcReader = new WarcReader (new FileInputStream (
17- new File ( "C:\\ warc-test\\ CC-MAIN-20180716232549-20180717012549-00001.warc.gz" ) ));
16+ "C:\\ warc-test\\ CC-MAIN-20180716232549-20180717012549-00001.warc.gz" ));
1817
1918 boolean hasNext = true ;
2019 while (hasNext ) {
@@ -23,8 +22,7 @@ public static void main(final String... arg) throws Exception {
2322
2423 optionalWarcRecord
2524 .filter (WarcRecord ::isResponse )
26- .map (warcRecord -> ((ResponseContentBlock ) warcRecord .getWarcContentBlock ())
27- .getPayloadAsString ())
25+ .map (warcRecord -> ((ResponseContentBlock ) warcRecord .getContentBlock ()).getPayloadAsString ())
2826 .ifPresent (System .out ::println );
2927
3028 hasNext = optionalWarcRecord .isPresent ();
Original file line number Diff line number Diff line change 22
33import com .github .bottomlessarchive .warc .service .WarcRecordStreamFactory ;
44import com .github .bottomlessarchive .warc .service .content .response .domain .ResponseContentBlock ;
5- import com .github .bottomlessarchive .warc .service .record .domain .WarcRecord ;
5+ import com .github .bottomlessarchive .warc .service .record .domain .WarcRecordType ;
6+
67import java .net .URL ;
78
89public class TestUrlWarcReader {
@@ -11,9 +12,8 @@ public static void main(final String... arg) throws Exception {
1112 final URL warcUrl = new URL (
1213 "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2018-43/segments/1539583508988.18/warc/CC-MAIN-20181015080248-20181015101748-00000.warc.gz" );
1314
14- WarcRecordStreamFactory .streamOf (warcUrl )
15- .filter (WarcRecord ::isResponse )
16- .map (entry -> ((ResponseContentBlock ) entry .getWarcContentBlock ()).getPayloadAsString ())
15+ WarcRecordStreamFactory .streamOf (warcUrl , WarcRecordType .RESPONSE )
16+ .map (entry -> ((ResponseContentBlock ) entry .getContentBlock ()).getPayloadAsString ())
1717 .forEach (System .out ::println );
1818 }
1919}
You can’t perform that action at this time.
0 commit comments