From eb5621fb29513eb7e777a5318f8a508c916fa07c Mon Sep 17 00:00:00 2001 From: solfe Date: Thu, 26 Mar 2026 22:05:31 +0900 Subject: [PATCH] Fix: remove flaky external crawling tests --- .../ContentCrawlabilityFilterTest.java | 84 +------ .../feed/filter/filters/MediumCrawlTest.java | 101 -------- .../service/AdvancedCoverImageDslTest.java | 232 ------------------ .../feed/service/MediumCoverImageTest.java | 173 ------------- 4 files changed, 1 insertion(+), 589 deletions(-) delete mode 100644 src/test/java/com/linglevel/api/content/feed/filter/filters/MediumCrawlTest.java delete mode 100644 src/test/java/com/linglevel/api/content/feed/service/AdvancedCoverImageDslTest.java delete mode 100644 src/test/java/com/linglevel/api/content/feed/service/MediumCoverImageTest.java diff --git a/src/test/java/com/linglevel/api/content/feed/filter/filters/ContentCrawlabilityFilterTest.java b/src/test/java/com/linglevel/api/content/feed/filter/filters/ContentCrawlabilityFilterTest.java index 3551190..652d427 100644 --- a/src/test/java/com/linglevel/api/content/feed/filter/filters/ContentCrawlabilityFilterTest.java +++ b/src/test/java/com/linglevel/api/content/feed/filter/filters/ContentCrawlabilityFilterTest.java @@ -6,8 +6,6 @@ import com.linglevel.api.crawling.repository.CrawlingDslRepository; import com.linglevel.api.crawling.service.CrawlingService; import com.rometools.rome.feed.synd.SyndEntry; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Test; @@ -37,86 +35,6 @@ void setUp() { filter = new ContentCrawlabilityFilter(crawlingDslRepository, crawlingService); } - @Test - @DisplayName("실제 Medium 글에서 콘텐츠 크롤링 성공 - 100자 이상") - void testMediumArticleCrawlability() { - // given: Medium 실제 URL - String mediumUrl = "https://medium.com/@haleemakashimri111/32-7-trending-javascript-libraries-fe427f00a3f4"; - - // Medium DSL 설정 (올바른 문법) - CrawlingDsl mediumDsl = CrawlingDsl.builder() - .domain("medium.com") - .contentDsl("D'''article p.pw-post-body-paragraph'''>#") - .build(); - - when(crawlingService.extractDomain(mediumUrl)).thenReturn("medium.com"); - when(crawlingDslRepository.findByDomain("medium.com")) - .thenReturn(Optional.of(mediumDsl)); - - // Mock SyndEntry - SyndEntry entry = mock(SyndEntry.class); - when(entry.getLink()).thenReturn(mediumUrl); - - FeedSource feedSource = mock(FeedSource.class); - - // when: 필터 실행 - FeedFilterResult result = filter.filter(entry, feedSource); - - // then: 통과해야 함 (100자 이상 추출됨) - assertTrue(result.isPassed(), "Medium 글은 100자 이상 콘텐츠가 추출되어야 함"); - - System.out.println("==========================================="); - System.out.println("=== Medium Article Crawlability Test ==="); - System.out.println("==========================================="); - System.out.println("URL: " + mediumUrl); - System.out.println("DSL: " + mediumDsl.getContentDsl()); - System.out.println("Result: " + (result.isPassed() ? "✅ PASS" : "❌ FAIL")); - if (!result.isPassed()) { - System.out.println("Reason: " + result.getReason()); - } - System.out.println("==========================================="); - } - - @Test - @DisplayName("실제 Medium 글 크롤링 상세 검증") - void testMediumArticleDetailedCrawling() throws Exception { - // given: Medium URL과 DSL (올바른 문법) - String mediumUrl = "https://medium.com/@haleemakashimri111/32-7-trending-javascript-libraries-fe427f00a3f4"; - String mediumDsl = "D'''article p.pw-post-body-paragraph'''>#"; - - // when: 직접 크롤링 - System.out.println("==========================================="); - System.out.println("=== Medium Article Detailed Crawling ==="); - System.out.println("==========================================="); - System.out.println("URL: " + mediumUrl); - System.out.println("DSL: " + mediumDsl); - System.out.println(); - - Document doc = Jsoup.connect(mediumUrl) - .timeout(10000) - .userAgent("Mozilla/5.0") - .get(); - - com.linglevel.api.crawling.dsl.CrawlerDsl crawler = new com.linglevel.api.crawling.dsl.CrawlerDsl(doc); - String extractedContent = crawler.executeAsString(mediumDsl); - - // then: 검증 - assertNotNull(extractedContent, "추출된 콘텐츠는 null이 아니어야 함"); - assertFalse(extractedContent.trim().isEmpty(), "추출된 콘텐츠는 비어있지 않아야 함"); - assertTrue(extractedContent.trim().length() >= 100, - "추출된 콘텐츠는 100자 이상이어야 함 (실제: " + extractedContent.trim().length() + "자)"); - - System.out.println("Extracted Content Length: " + extractedContent.trim().length() + " chars"); - System.out.println(); - System.out.println("Extracted Content Preview (first 500 chars):"); - System.out.println("-------------------------------------------"); - System.out.println(extractedContent.substring(0, Math.min(500, extractedContent.length()))); - System.out.println("-------------------------------------------"); - System.out.println(); - System.out.println("✅ Test Result: PASS (100자 이상 추출 성공)"); - System.out.println("==========================================="); - } - @Test @DisplayName("CrawlingDsl이 없는 도메인은 불통과") void testNoCrawlingDsl() { @@ -242,4 +160,4 @@ void testFilterOrder() { // then assertEquals(100, order, "HTTP 요청이 필요하므로 가장 나중에 실행되어야 함"); } -} \ No newline at end of file +} diff --git a/src/test/java/com/linglevel/api/content/feed/filter/filters/MediumCrawlTest.java b/src/test/java/com/linglevel/api/content/feed/filter/filters/MediumCrawlTest.java deleted file mode 100644 index f25b939..0000000 --- a/src/test/java/com/linglevel/api/content/feed/filter/filters/MediumCrawlTest.java +++ /dev/null @@ -1,101 +0,0 @@ -package com.linglevel.api.content.feed.filter.filters; - -import com.linglevel.api.crawling.dsl.CrawlerDsl; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.junit.jupiter.api.DisplayName; -import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.*; - -@DisplayName("Medium 실제 크롤링 테스트") -class MediumCrawlTest { - - @Test - @DisplayName("Medium 글에서 실제 콘텐츠 추출") - void crawlMediumArticle() throws Exception { - // given - String url = "https://medium.com/@haleemakashimri111/32-7-trending-javascript-libraries-fe427f00a3f4"; - String dsl = "D'''article p.pw-post-body-paragraph'''>#"; - - System.out.println("=".repeat(80)); - System.out.println("Medium 글 크롤링 테스트"); - System.out.println("=".repeat(80)); - System.out.println("URL: " + url); - System.out.println("DSL: " + dsl); - System.out.println(); - - // when: 페이지 가져오기 - Document doc = Jsoup.connect(url) - .timeout(10000) - .userAgent("Mozilla/5.0") - .get(); - - CrawlerDsl crawler = new CrawlerDsl(doc); - String content = crawler.executeAsString(dsl); - - // then: 검증 - System.out.println("크롤링 결과:"); - System.out.println("-".repeat(80)); - - if (content != null) { - System.out.println("총 길이: " + content.length() + "자"); - System.out.println(); - System.out.println("내용 (전체):"); - System.out.println(content); - } else { - System.out.println("❌ 콘텐츠를 추출하지 못했습니다."); - } - - System.out.println("-".repeat(80)); - System.out.println(); - - assertNotNull(content, "콘텐츠는 null이 아니어야 함"); - assertFalse(content.trim().isEmpty(), "콘텐츠는 비어있지 않아야 함"); - assertTrue(content.length() >= 100, - "콘텐츠는 100자 이상이어야 함 (실제: " + content.length() + "자)"); - - System.out.println("✅ 테스트 통과!"); - System.out.println(" - 콘텐츠 길이: " + content.length() + "자"); - System.out.println(" - 최소 요구사항: 100자"); - System.out.println("=".repeat(80)); - } - - @Test - @DisplayName("Medium 글 HTML 구조 분석") - void analyzeMediumStructure() throws Exception { - // given - String url = "https://medium.com/@haleemakashimri111/32-7-trending-javascript-libraries-fe427f00a3f4"; - - System.out.println("=".repeat(80)); - System.out.println("Medium HTML 구조 분석"); - System.out.println("=".repeat(80)); - System.out.println("URL: " + url); - System.out.println(); - - // when - Document doc = Jsoup.connect(url) - .timeout(10000) - .userAgent("Mozilla/5.0") - .get(); - - System.out.println("1. article 태그 개수: " + doc.select("article").size()); - System.out.println("2. p.pw-post-body-paragraph 개수: " + - doc.select("p.pw-post-body-paragraph").size()); - System.out.println("3. article p.pw-post-body-paragraph 개수: " + - doc.select("article p.pw-post-body-paragraph").size()); - System.out.println(); - - System.out.println("첫 3개 문단:"); - System.out.println("-".repeat(80)); - - var paragraphs = doc.select("article p.pw-post-body-paragraph"); - for (int i = 0; i < Math.min(3, paragraphs.size()); i++) { - String text = paragraphs.get(i).text(); - System.out.println((i + 1) + ". " + text); - System.out.println(); - } - - System.out.println("=".repeat(80)); - } -} \ No newline at end of file diff --git a/src/test/java/com/linglevel/api/content/feed/service/AdvancedCoverImageDslTest.java b/src/test/java/com/linglevel/api/content/feed/service/AdvancedCoverImageDslTest.java deleted file mode 100644 index 7e64d6c..0000000 --- a/src/test/java/com/linglevel/api/content/feed/service/AdvancedCoverImageDslTest.java +++ /dev/null @@ -1,232 +0,0 @@ -package com.linglevel.api.content.feed.service; - -import com.linglevel.api.crawling.dsl.CrawlerDsl; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.junit.jupiter.api.DisplayName; -import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.*; - -@DisplayName("고급 커버 이미지 DSL 테스트") -class AdvancedCoverImageDslTest { - - @Test - @DisplayName("정교한 fallback 체인 DSL로 커버 이미지 추출") - void extractCoverImageWithAdvancedDsl() throws Exception { - // given: 고급 커버 이미지 DSL (여러 fallback) - String coverImageDsl = "D'meta[property=\"og:image\"]'@'content' ? " + - "D'article figure img:not([class=\"logo\"]):not([id=\"logo\"]):not([class=\"brand\"]):not([id=\"brand\"]):not([class=\"icon\"]):not([src=\".svg\"]):not([class=\"ad\"]):not([id=\"ad\"]):not([class=\"advert\"]):not([id=\"advert\"]):not([class=\"avatar\"]):not([class=\"profile\"])'@'src' ? " + - "D'article picture img:not([class=\"logo\"]):not([id=\"logo\"]):not([class=\"brand\"]):not([id=\"brand\"]):not([class=\"icon\"]):not([src=\".svg\"]):not([class=\"ad\"]):not([id=\"ad\"]):not([class=\"advert\"]):not([id=\"advert\"]):not([class=\"avatar\"]):not([class=\"profile\"])'@'src' ? " + - "D'article .hero-image img, article .main-image img, article .featured-image img'@'src'"; - - // given: 다양한 article URLs (Medium, BBC, TechCrunch 등) - String[] testUrls = { - "https://python.plainenglish.io/the-ai-workflow-that-runs-my-business-while-i-sleep-def4424d29cd", - "https://medium.com/codetodeploy/how-i-used-machine-learning-to-predict-my-business-metrics-89cca3c59ec6", - "https://techcrunch.com/2024/11/05/anthropics-new-ai-can-control-your-pc/" - }; - - System.out.println("==========================================="); - System.out.println("=== 고급 Fallback 체인 DSL 테스트 ==="); - System.out.println("==========================================="); - System.out.println("DSL:"); - System.out.println("1. og:image 메타 태그"); - System.out.println("2. article figure img (로고/광고 제외)"); - System.out.println("3. article picture img (로고/광고 제외)"); - System.out.println("4. article hero/main/featured 이미지"); - System.out.println("===========================================\n"); - - for (String url : testUrls) { - try { - System.out.println("--- " + url + " ---"); - - Document doc = Jsoup.connect(url) - .timeout(15000) - .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") - .get(); - - CrawlerDsl crawler = new CrawlerDsl(doc); - String coverImage = crawler.executeAsString(coverImageDsl); - - if (coverImage != null && !coverImage.isEmpty()) { - System.out.println("✓ 커버 이미지 추출 성공"); - System.out.println(" URL: " + coverImage); - - // 어느 fallback에서 추출되었는지 확인 - String ogImage = crawler.executeAsString("D'meta[property=\"og:image\"]'@'content'"); - if (ogImage != null && ogImage.equals(coverImage)) { - System.out.println(" Method: og:image (1st fallback)"); - } else { - System.out.println(" Method: 다른 fallback 사용됨"); - } - } else { - System.out.println("✗ 커버 이미지 추출 실패"); - } - - System.out.println(); - - } catch (Exception e) { - System.out.println("✗ 에러: " + e.getMessage()); - System.out.println(); - } - } - - System.out.println("==========================================="); - } - - @Test - @DisplayName("각 fallback 단계별 테스트") - void testEachFallbackStep() throws Exception { - // given: Medium article - String url = "https://python.plainenglish.io/the-ai-workflow-that-runs-my-business-while-i-sleep-def4424d29cd"; - - System.out.println("==========================================="); - System.out.println("=== Fallback 단계별 분석 ==="); - System.out.println("==========================================="); - System.out.println("URL: " + url); - System.out.println(); - - Document doc = Jsoup.connect(url) - .timeout(15000) - .userAgent("Mozilla/5.0") - .get(); - - CrawlerDsl crawler = new CrawlerDsl(doc); - - // Fallback 1: og:image - System.out.println("1. og:image 메타 태그:"); - String ogImage = crawler.executeAsString("D'meta[property=\"og:image\"]'@'content'"); - System.out.println(" " + (ogImage != null ? "✓ " + ogImage : "✗ 없음")); - System.out.println(); - - // Fallback 2: article figure img - System.out.println("2. article figure img (필터링):"); - String figureImg = crawler.executeAsString( - "D'article figure img:not([class=\"logo\"]):not([id=\"logo\"]):not([class=\"brand\"]):not([id=\"brand\"]):not([class=\"icon\"]):not([src=\".svg\"]):not([class=\"ad\"]):not([id=\"ad\"]):not([class=\"advert\"]):not([id=\"advert\"]):not([class=\"avatar\"]):not([class=\"profile\"])'@'src'" - ); - System.out.println(" " + (figureImg != null ? "✓ " + figureImg : "✗ 없음")); - System.out.println(); - - // Fallback 3: article picture img - System.out.println("3. article picture img (필터링):"); - String pictureImg = crawler.executeAsString( - "D'article picture img:not([class=\"logo\"]):not([id=\"logo\"]):not([class=\"brand\"]):not([id=\"brand\"]):not([class=\"icon\"]):not([src=\".svg\"]):not([class=\"ad\"]):not([id=\"ad\"]):not([class=\"advert\"]):not([id=\"advert\"]):not([class=\"avatar\"]):not([class=\"profile\"])'@'src'" - ); - System.out.println(" " + (pictureImg != null ? "✓ " + pictureImg : "✗ 없음")); - System.out.println(); - - // Fallback 4: hero/main/featured images - System.out.println("4. hero/main/featured 이미지:"); - String heroImg = crawler.executeAsString("D'article .hero-image img, article .main-image img, article .featured-image img'@'src'"); - System.out.println(" " + (heroImg != null ? "✓ " + heroImg : "✗ 없음")); - System.out.println(); - - // 최종 결과 (전체 체인) - System.out.println("==========================================="); - System.out.println("최종 결과 (전체 fallback 체인):"); - String finalResult = crawler.executeAsString( - "D'meta[property=\"og:image\"]'@'content' ? " + - "D'article figure img:not([class=\"logo\"]):not([id=\"logo\"]):not([class=\"brand\"]):not([id=\"brand\"]):not([class=\"icon\"]):not([src=\".svg\"]):not([class=\"ad\"]):not([id=\"ad\"]):not([class=\"advert\"]):not([id=\"advert\"]):not([class=\"avatar\"]):not([class=\"profile\"])'@'src' ? " + - "D'article picture img:not([class=\"logo\"]):not([id=\"logo\"]):not([class=\"brand\"]):not([id=\"brand\"]):not([class=\"icon\"]):not([src=\".svg\"]):not([class=\"ad\"]):not([id=\"ad\"]):not([class=\"advert\"]):not([id=\"advert\"]):not([class=\"avatar\"]):not([class=\"profile\"])'@'src' ? " + - "D'article .hero-image img, article .main-image img, article .featured-image img'@'src'" - ); - System.out.println(finalResult != null ? "✓ " + finalResult : "✗ 실패"); - System.out.println("==========================================="); - - assertNotNull(finalResult, "커버 이미지를 찾을 수 없습니다"); - } - - @Test - @DisplayName("다양한 뉴스 사이트에서 커버 이미지 추출") - void extractFromVariousNewsSites() throws Exception { - String coverImageDsl = "D'meta[property=\"og:image\"]'@'content' ? " + - "D'article figure img:not([class=\"logo\"]):not([id=\"logo\"]):not([class=\"brand\"]):not([id=\"brand\"]):not([class=\"icon\"]):not([src=\".svg\"]):not([class=\"ad\"]):not([id=\"ad\"]):not([class=\"advert\"]):not([id=\"advert\"]):not([class=\"avatar\"]):not([class=\"profile\"])'@'src' ? " + - "D'article picture img:not([class=\"logo\"]):not([id=\"logo\"]):not([class=\"brand\"]):not([id=\"brand\"]):not([class=\"icon\"]):not([src=\".svg\"]):not([class=\"ad\"]):not([id=\"ad\"]):not([class=\"advert\"]):not([id=\"advert\"]):not([class=\"avatar\"]):not([class=\"profile\"])'@'src' ? " + - "D'article .hero-image img, article .main-image img, article .featured-image img'@'src'"; - - String[][] testSites = { - {"Medium", "https://python.plainenglish.io/the-ai-workflow-that-runs-my-business-while-i-sleep-def4424d29cd"}, - {"TechCrunch", "https://techcrunch.com/2024/11/05/anthropics-new-ai-can-control-your-pc/"}, - {"The Verge", "https://www.theverge.com/2024/11/5/24288788/anthropic-ai-model-computer-use-control-pc"} - }; - - System.out.println("==========================================="); - System.out.println("=== 다양한 뉴스 사이트 테스트 ==="); - System.out.println("===========================================\n"); - - int successCount = 0; - int totalCount = testSites.length; - - for (String[] site : testSites) { - String siteName = site[0]; - String url = site[1]; - - try { - System.out.println("--- " + siteName + " ---"); - System.out.println("URL: " + url); - - Document doc = Jsoup.connect(url) - .timeout(15000) - .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") - .get(); - - CrawlerDsl crawler = new CrawlerDsl(doc); - String coverImage = crawler.executeAsString(coverImageDsl); - - if (coverImage != null && !coverImage.isEmpty()) { - System.out.println("✓ 성공"); - System.out.println("Image: " + (coverImage.length() > 80 ? coverImage.substring(0, 80) + "..." : coverImage)); - successCount++; - } else { - System.out.println("✗ 실패"); - } - - } catch (Exception e) { - System.out.println("✗ 에러: " + e.getMessage()); - } - - System.out.println(); - } - - System.out.println("==========================================="); - System.out.println("결과: " + successCount + "/" + totalCount + " 성공"); - System.out.println("성공률: " + String.format("%.1f%%", (successCount * 100.0 / totalCount))); - System.out.println("==========================================="); - } - - @Test - @DisplayName("권장 DSL 출력") - void printRecommendedDsl() { - System.out.println("==========================================="); - System.out.println("=== FeedSource에 사용할 권장 DSL ==="); - System.out.println("===========================================\n"); - - String recommendedDsl = "D'meta[property=\"og:image\"]'@'content' ? " + - "D'article figure img:not([class=\"logo\"]):not([id=\"logo\"]):not([class=\"brand\"]):not([id=\"brand\"]):not([class=\"icon\"]):not([src=\".svg\"]):not([class=\"ad\"]):not([id=\"ad\"]):not([class=\"advert\"]):not([id=\"advert\"]):not([class=\"avatar\"]):not([class=\"profile\"])'@'src' ? " + - "D'article picture img:not([class=\"logo\"]):not([id=\"logo\"]):not([class=\"brand\"]):not([id=\"brand\"]):not([class=\"icon\"]):not([src=\".svg\"]):not([class=\"ad\"]):not([id=\"ad\"]):not([class=\"advert\"]):not([id=\"advert\"]):not([class=\"avatar\"]):not([class=\"profile\"])'@'src' ? " + - "D'article .hero-image img, article .main-image img, article .featured-image img'@'src'"; - - System.out.println("coverImageDsl:"); - System.out.println(recommendedDsl); - System.out.println(); - - System.out.println("설명:"); - System.out.println("1. og:image 메타 태그 (가장 안정적)"); - System.out.println("2. article figure img (로고, 광고, 아바타 제외)"); - System.out.println("3. article picture img (로고, 광고, 아바타 제외)"); - System.out.println("4. article의 hero/main/featured 이미지"); - System.out.println(); - - System.out.println("API 요청 예시:"); - System.out.println("{"); - System.out.println(" \"url\": \"https://medium.com/feed/tag/programming\","); - System.out.println(" \"name\": \"Medium Programming\","); - System.out.println(" \"coverImageDsl\": \"" + recommendedDsl.replace("\"", "\\\"") + "\","); - System.out.println(" \"contentType\": \"NEWS\","); - System.out.println(" \"category\": \"TECH\""); - System.out.println("}"); - - System.out.println("\n==========================================="); - } -} diff --git a/src/test/java/com/linglevel/api/content/feed/service/MediumCoverImageTest.java b/src/test/java/com/linglevel/api/content/feed/service/MediumCoverImageTest.java deleted file mode 100644 index 52009a7..0000000 --- a/src/test/java/com/linglevel/api/content/feed/service/MediumCoverImageTest.java +++ /dev/null @@ -1,173 +0,0 @@ -package com.linglevel.api.content.feed.service; - -import com.linglevel.api.crawling.dsl.CrawlerDsl; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.junit.jupiter.api.DisplayName; -import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.*; - -@DisplayName("Medium 커버 이미지 추출 테스트") -class MediumCoverImageTest { - - @Test - @DisplayName("Medium article에서 제공된 DSL로 커버 이미지 추출") - void extractCoverImageFromMediumArticle() throws Exception { - // given: Medium article URL (테스트에서 가져온 실제 URL) - String articleUrl = "https://python.plainenglish.io/the-ai-workflow-that-runs-my-business-while-i-sleep-def4424d29cd"; - - // given: 제공된 커버 이미지 DSL - String coverImageDsl = "D'''article [data-component=\"article-body\"], article [data-testid=\"article-body\"], article [itemprop=\"articleBody\"], article .article-body__content, article .article-body, article .body__content''' > '''p:not([class*=\"caption\"]):not([class*=\"credit\"]):not([class*=\"byline\"]):not([class*=\"author\"]):not([class*=\"date\"]):not([class*=\"timestamp\"]):not([class*=\"dateline\"]):not([class*=\"note\"])''' > #^"; - - System.out.println("==========================================="); - System.out.println("=== Medium Article 분석 ==="); - System.out.println("==========================================="); - System.out.println("URL: " + articleUrl); - System.out.println(); - - // when: Article HTML 가져오기 - Document doc = Jsoup.connect(articleUrl) - .timeout(15000) - .userAgent("Mozilla/5.0") - .get(); - - System.out.println("HTML 페이지 로드 완료"); - System.out.println(); - - // DSL로 커버 이미지 추출 시도 - CrawlerDsl crawler = new CrawlerDsl(doc); - String result = crawler.executeAsString(coverImageDsl); - - System.out.println("=== DSL 실행 결과 ==="); - if (result != null && !result.trim().isEmpty()) { - System.out.println("추출된 내용 (첫 500자):"); - System.out.println(result.substring(0, Math.min(500, result.length()))); - if (result.length() > 500) { - System.out.println("... (총 " + result.length() + "자)"); - } - } else { - System.out.println("추출 실패 또는 비어있음"); - } - System.out.println(); - - // 다른 방법들도 시도 - System.out.println("=== 대체 방법 시도 ==="); - - // 1. og:image 메타 태그 - String ogImageDsl = "D'meta[property=\"og:image\"]'@'content'"; - String ogImage = crawler.executeAsString(ogImageDsl); - System.out.println("og:image: " + ogImage); - - // 2. twitter:image 메타 태그 - String twitterImageDsl = "D'meta[name=\"twitter:image\"]'@'content'"; - String twitterImage = crawler.executeAsString(twitterImageDsl); - System.out.println("twitter:image: " + twitterImage); - - // 3. article 내 첫 번째 img - String firstImgDsl = "D'article img'@'src'"; - String firstImg = crawler.executeAsString(firstImgDsl); - System.out.println("첫 번째 img src: " + firstImg); - - // 4. picture 태그 - String pictureDsl = "D'article picture img'@'src'"; - String pictureImg = crawler.executeAsString(pictureDsl); - System.out.println("picture img: " + pictureImg); - - System.out.println(); - System.out.println("==========================================="); - - // 가장 좋은 이미지 선택 - String bestImage = null; - if (ogImage != null && !ogImage.isEmpty()) { - bestImage = ogImage; - System.out.println("선택된 이미지: og:image"); - } else if (twitterImage != null && !twitterImage.isEmpty()) { - bestImage = twitterImage; - System.out.println("선택된 이미지: twitter:image"); - } else if (firstImg != null && !firstImg.isEmpty()) { - bestImage = firstImg; - System.out.println("선택된 이미지: 첫 번째 img"); - } else if (pictureImg != null && !pictureImg.isEmpty()) { - bestImage = pictureImg; - System.out.println("선택된 이미지: picture img"); - } - - System.out.println("최종 커버 이미지 URL: " + bestImage); - System.out.println("==========================================="); - - assertNotNull(bestImage, "커버 이미지를 찾을 수 없습니다"); - } - - @Test - @DisplayName("Medium 여러 article에서 커버 이미지 추출 테스트") - void extractCoverImagesFromMultipleArticles() throws Exception { - // given: 여러 Medium article URLs - String[] articleUrls = { - "https://python.plainenglish.io/the-ai-workflow-that-runs-my-business-while-i-sleep-def4424d29cd", - "https://medium.com/codetodeploy/how-i-used-machine-learning-to-predict-my-business-metrics-89cca3c59ec6", - "https://ai.gopubby.com/the-ai-automation-that-cut-my-operating-costs-by-40-5c8d715e9f17" - }; - - System.out.println("==========================================="); - System.out.println("=== 여러 Medium Article 커버 이미지 추출 ==="); - System.out.println("===========================================\n"); - - for (String url : articleUrls) { - try { - System.out.println("--- " + url + " ---"); - - Document doc = Jsoup.connect(url) - .timeout(15000) - .userAgent("Mozilla/5.0") - .get(); - - CrawlerDsl crawler = new CrawlerDsl(doc); - - // og:image 시도 - String ogImageDsl = "D'meta[property=\"og:image\"]'@'content'"; - String ogImage = crawler.executeAsString(ogImageDsl); - - if (ogImage != null && !ogImage.isEmpty()) { - System.out.println("✓ 커버 이미지 발견: " + ogImage); - } else { - System.out.println("✗ 커버 이미지 없음"); - } - - System.out.println(); - - } catch (Exception e) { - System.out.println("✗ 실패: " + e.getMessage()); - System.out.println(); - } - } - - System.out.println("==========================================="); - } - - @Test - @DisplayName("Medium에서 권장되는 커버 이미지 DSL") - void recommendedCoverImageDsl() { - System.out.println("==========================================="); - System.out.println("=== Medium 커버 이미지 추출 권장 DSL ==="); - System.out.println("===========================================\n"); - - System.out.println("1. 가장 안정적 (og:image 메타 태그):"); - System.out.println(" D'meta[property=\"og:image\"]'@'content'"); - System.out.println(); - - System.out.println("2. Fallback 1 (twitter:image):"); - System.out.println(" D'meta[name=\"twitter:image\"]'@'content'"); - System.out.println(); - - System.out.println("3. Fallback 2 (article 첫 이미지):"); - System.out.println(" D'article img'@'src'"); - System.out.println(); - - System.out.println("4. 조합 DSL (null coalescing):"); - System.out.println(" D'meta[property=\"og:image\"]'@'content' ? D'meta[name=\"twitter:image\"]'@'content' ? D'article img'@'src'"); - System.out.println(); - - System.out.println("==========================================="); - } -}