From de069db2ca0e8d5d7927a5c674a15d9fd6eb5d76 Mon Sep 17 00:00:00 2001 From: tomchccom Date: Sun, 17 May 2026 15:21:59 +0900 Subject: [PATCH 1/6] =?UTF-8?q?feat:=20=EC=9E=90=EC=86=8C=EC=84=9C=20?= =?UTF-8?q?=EA=B8=B0=EB=B0=98=20=EA=B2=BD=ED=97=98=20=EC=B6=94=EC=B6=9C=20?= =?UTF-8?q?API=20=EB=B0=8F=20=EC=84=9C=EB=B9=84=EC=8A=A4=20=EB=A1=9C?= =?UTF-8?q?=EC=A7=81=20=EC=B6=94=EA=B0=80=20(STAR=20=ED=8F=AC=EB=A7=B7=20?= =?UTF-8?q?=EC=B6=94=EC=B6=9C)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- myeongsung/.gitignore | 5 ++ myeongsung/app/api/router.py | 33 ++++++- myeongsung/app/schemas/resume_dto.py | 22 +++++ .../services/experience_extraction_service.py | 90 +++++++++++++++++++ .../scripts/test_experience_extraction.py | 47 ++++++++++ 5 files changed, 196 insertions(+), 1 deletion(-) create mode 100644 myeongsung/app/services/experience_extraction_service.py create mode 100644 myeongsung/scripts/test_experience_extraction.py diff --git a/myeongsung/.gitignore b/myeongsung/.gitignore index 96b5a6d..97a3514 100644 --- a/myeongsung/.gitignore +++ b/myeongsung/.gitignore @@ -37,3 +37,8 @@ wheels/ # SQLite *.db *.sqlite3logs/ + +# Data and API specs +data/ +ai_job_extraction_api_spec.md +tests/results/ diff --git a/myeongsung/app/api/router.py b/myeongsung/app/api/router.py index 563d886..dd4f0c5 100644 --- a/myeongsung/app/api/router.py +++ b/myeongsung/app/api/router.py @@ -77,8 +77,39 @@ async def analyze_image(files: List[UploadFile] = File(...)): except Exception as e: raise HTTPException(status_code=500, detail=str(e)) -@router.post("/analyze-and-place", response_model=PlacementResponse) +from app.schemas.resume_dto import ExperienceExtractionResponse +from app.services.experience_extraction_service import extract_experiences_from_text, extract_experiences_from_url, extract_experiences_from_pdf + +@router.post("/extract-experiences", response_model=ExperienceExtractionResponse) +async def extract_experiences( + file: Optional[UploadFile] = File(None, description="자소서 원문 PDF 파일"), + url: Optional[str] = Form(None, description="자소서 웹페이지 URL"), + text: Optional[str] = Form(None, description="자소서 텍스트 원문") +): + """ + 자소서 원문(PDF, URL, 텍스트 중 하나)을 입력받아, 내재된 경험들을 STAR 포맷으로 구조화하여 추출합니다. + """ + if not file and not (url and url.strip()) and not (text and text.strip()): + raise HTTPException( + status_code=400, + detail="file (업로드 파일), url, text 중 최소 하나는 제공되어야 합니다." + ) + try: + if file and file.filename: + file_content = await file.read() + if file.filename.lower().endswith(".pdf"): + return extract_experiences_from_pdf(file_content) + else: + return extract_experiences_from_text(file_content.decode("utf-8")) + elif url and url.strip(): + return extract_experiences_from_url(url.strip()) + elif text and text.strip(): + return extract_experiences_from_text(text.strip()) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + +@router.post("/analyze-and-place", response_model=PlacementResponse) async def analyze_and_place( background_tasks: BackgroundTasks, jd_pdf: Optional[UploadFile] = File(None, description="채용공고 원문 PDF 파일 (업스테이지 파싱용)"), diff --git a/myeongsung/app/schemas/resume_dto.py b/myeongsung/app/schemas/resume_dto.py index bfa0d0e..52bc2c9 100644 --- a/myeongsung/app/schemas/resume_dto.py +++ b/myeongsung/app/schemas/resume_dto.py @@ -32,3 +32,25 @@ class PlacementResult(BaseModel): class PlacementResponse(BaseModel): placements: List[PlacementResult] errors: List[str] = [] + +# ── 자소서 기반 경험 추출 스키마 ────────────────────────────────────── +class ExtractedExperience(BaseModel): + experience_name: str = Field(..., description="경험명 (예: 경식이 AI 전화 서비스 기획)") + experience_type: str = Field(..., description="경험 유형 (예: 프로젝트, 인턴, 동아리, 창업, 해커톤 등)") + organization: Optional[str] = Field(None, description="기관/소속") + period: Optional[str] = Field(None, description="기간") + my_role: str = Field(..., description="나의 역할 (Task)") + + # STAR + L + situation: str = Field(..., description="[S] 문제상황") + action: str = Field(..., description="[A] 주요 행동") + result: str = Field(..., description="[R] 결과/성과") + learnings: Optional[str] = Field(None, description="배운 점") + + core_competencies: List[str] = Field(..., description="핵심 역량 태그 (예: 문제해결, 기획력 등)") + applicable_questions: List[str] = Field(..., description="활용 가능 문항 (예: 문제해결 경험, 도전 경험 등)") + source_text: str = Field(..., description="원문 출처 (추출의 근거가 된 자소서 원본 일부)") + status: str = Field(default="미확인", description="상태 (미확인, 저장완료, 삭제 등)") + +class ExperienceExtractionResponse(BaseModel): + experiences: List[ExtractedExperience] = Field(..., description="추출된 경험 후보 목록") diff --git a/myeongsung/app/services/experience_extraction_service.py b/myeongsung/app/services/experience_extraction_service.py new file mode 100644 index 0000000..4eb4e7e --- /dev/null +++ b/myeongsung/app/services/experience_extraction_service.py @@ -0,0 +1,90 @@ +import os +import requests +from bs4 import BeautifulSoup +import fitz # PyMuPDF +from langchain_openai import ChatOpenAI +from langchain_core.prompts import ChatPromptTemplate +from app.schemas.resume_dto import ExperienceExtractionResponse + +def extract_experiences_from_text(text: str) -> ExperienceExtractionResponse: + """ + 텍스트 본문(자소서 등)에서 AI를 사용해 경험을 STAR 기반으로 구조화하여 추출합니다. + """ + llm = ChatOpenAI(model="gpt-4o", temperature=0) + + prompt = ChatPromptTemplate.from_messages([ + ("system", ( + "당신은 사용자의 자기소개서(자소서) 원문에서 경험(Experience)을 추출하는 전문가입니다.\n" + "주어진 자소서 내용에서 하나 또는 여러 개의 독립된 경험을 추출하여 구조화된 데이터로 반환하세요.\n\n" + "### 추출 가이드라인:\n" + "1. **경험 분리**: 자소서 하나에 여러 경험(예: 인턴, 해커톤 등)이 섞여 있다면 각각을 분리하여 추출하세요.\n" + "2. **STAR 구조화**: 각 경험에 대해 다음 항목들을 명확히 분류하세요.\n" + " - 경험명 (experience_name)\n" + " - 경험 유형 (experience_type)\n" + " - 기관/소속 (organization)\n" + " - 기간 (period)\n" + " - 나의 역할 (my_role)\n" + " - [S] 문제상황 (situation)\n" + " - [A] 주요 행동 (action)\n" + " - [R] 결과/성과 (result)\n" + " - [L] 배운 점 (learnings)\n" + "3. **역량 태그**: 해당 경험을 통해 어필할 수 있는 핵심 역량(예: 문제해결, 사용자 이해, 기획력, 소익성 등)을 2~4개 추출하여 core_competencies 필드에 저장하세요.\n" + "4. **활용 가능 문항**: 이 경험이 어떤 면접/자소서 문항(예: 갈등 극복, 도전, 직무 역량, 공익 기여 등)에 적합한지 추천하여 applicable_questions 필드에 저장하세요.\n" + "5. **원문 출처**: 해당 경험을 추출한 원문의 실제 문장들을 source_text 필드에 기록하세요.\n" + "6. **상태**: 상태(status)는 기본적으로 '미확인'으로 지정하세요.\n" + )), + ("user", "다음은 자기소개서 내용입니다. 위 가이드라인에 따라 경험을 추출해주세요:\n\n{text}") + ]) + + chain = prompt | llm.with_structured_output(ExperienceExtractionResponse) + + try: + result = chain.invoke({"text": text}, config={"run_name": "experience-extraction"}) + return result + except Exception as e: + raise ValueError(f"경험 추출 중 오류가 발생했습니다: {str(e)}") + + +def extract_experiences_from_url(url: str) -> ExperienceExtractionResponse: + """ + URL에서 텍스트를 추출한 후 경험 추출 로직을 실행합니다. + """ + try: + response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10) + response.raise_for_status() + + soup = BeautifulSoup(response.text, "html.parser") + for script in soup(["script", "style"]): + script.decompose() + + full_text = soup.get_text(separator="\n") + lines = (line.strip() for line in full_text.splitlines()) + chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + full_text = "\n".join(chunk for chunk in chunks if chunk) + + if not full_text.strip(): + raise ValueError("URL에서 유의미한 텍스트를 추출하지 못했습니다.") + + return extract_experiences_from_text(full_text) + except Exception as e: + raise ValueError(f"URL 분석 중 오류가 발생했습니다: {str(e)}") + + +def extract_experiences_from_pdf(file_content: bytes) -> ExperienceExtractionResponse: + """ + PDF 바이너리 데이터에서 PyMuPDF를 사용하여 텍스트를 추출한 후 경험 추출 로직을 실행합니다. + """ + try: + doc = fitz.open(stream=file_content, filetype="pdf") + text_list = [] + for page in doc: + text_list.append(page.get_text()) + full_text = "\n".join(text_list) + + if not full_text.strip(): + raise ValueError("PDF에서 유의미한 텍스트를 추출하지 못했습니다.") + + return extract_experiences_from_text(full_text) + except Exception as e: + raise ValueError(f"PDF 분석 중 오류가 발생했습니다: {str(e)}") + diff --git a/myeongsung/scripts/test_experience_extraction.py b/myeongsung/scripts/test_experience_extraction.py new file mode 100644 index 0000000..fc353d1 --- /dev/null +++ b/myeongsung/scripts/test_experience_extraction.py @@ -0,0 +1,47 @@ +import os +import sys +from dotenv import load_dotenv + +# sys.path에 app 폴더 상위 경로 추가 +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from app.services.experience_extraction_service import extract_experiences_from_text + +# .env 로드 +load_dotenv() + +def test_extraction(): + sample_cover_letter = """ +[문항 1] 본인이 지원한 직무에 대해 어떤 역량을 갖추고 있으며, 이를 발휘한 구체적인 경험을 기술해 주십시오. + +저는 IT융합 해커톤에 참가하여 '경식이 AI 전화 서비스'를 기획하고 우수한 성과를 거둔 경험이 있습니다. +당시 고령층 사용자들이 복잡한 디지털 여가 추천 모바일 앱에 접근하기 어려워한다는 문제상황에 주목했습니다. +저는 이 문제를 해결하기 위해 팀의 서비스 기획 및 사용자 조사를 총괄하며, 팀원들과 함께 인근 노인복지센터 및 공원에 직접 찾아가 현장 인터뷰를 수행했습니다. 현장 실사 과정에서 고령층의 피드백을 통해 단순 터치 방식보다 음성 기반 전화 방식이 훨씬 접근성이 좋다는 사실을 파악하고 기획의 방향을 전환하였습니다. +그 결과, 전화를 걸면 음성 안내와 대화를 통해 실시간 여가 및 복지 정보를 추천해주는 AI 전화를 설계했고, 부산대학교 IT융합 해커톤에서 최종적으로 장려상과 인기상을 동시에 수상하는 쾌거를 이루었습니다. +이 과정에서 사용자 중심의 문제해결 역량과 기획력을 크게 향상시킬 수 있었습니다. +""" + + print("=== 경험 추출 테스트 시작 ===") + try: + result = extract_experiences_from_text(sample_cover_letter) + print("\n[추출 성공!]") + for idx, exp in enumerate(result.experiences, 1): + print(f"\n--- 경험 후보 {idx} ---") + print(f"경험명: {exp.experience_name}") + print(f"경험 유형: {exp.experience_type}") + print(f"기관/소속: {exp.organization}") + print(f"기간: {exp.period}") + print(f"나의 역할: {exp.my_role}") + print(f"S (상황): {exp.situation}") + print(f"A (행동): {exp.action}") + print(f"R (결과): {exp.result}") + print(f"L (배운점): {exp.learnings}") + print(f"핵심 역량 태그: {exp.core_competencies}") + print(f"활용 가능 문항: {exp.applicable_questions}") + print(f"원문 출처: {exp.source_text}") + print(f"상태: {exp.status}") + except Exception as e: + print(f"에러 발생: {e}") + +if __name__ == "__main__": + test_extraction() From a81e132e018c1e40cc0246c2958e5d17dc5cec8c Mon Sep 17 00:00:00 2001 From: tomchccom Date: Sun, 17 May 2026 15:40:17 +0900 Subject: [PATCH 2/6] =?UTF-8?q?test:=20URL=20=EA=B8=B0=EB=B0=98=20?= =?UTF-8?q?=EC=9E=90=EC=86=8C=EC=84=9C=20=EA=B2=BD=ED=97=98=20=EC=B6=94?= =?UTF-8?q?=EC=B6=9C=20=ED=85=8C=EC=8A=A4=ED=8A=B8=20=EC=8A=A4=ED=81=AC?= =?UTF-8?q?=EB=A6=BD=ED=8A=B8=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../scripts/test_experience_extraction_url.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 myeongsung/scripts/test_experience_extraction_url.py diff --git a/myeongsung/scripts/test_experience_extraction_url.py b/myeongsung/scripts/test_experience_extraction_url.py new file mode 100644 index 0000000..892d041 --- /dev/null +++ b/myeongsung/scripts/test_experience_extraction_url.py @@ -0,0 +1,58 @@ +import os +import sys +import argparse +from dotenv import load_dotenv + +# sys.path에 app 폴더 상위 경로 추가 +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from app.services.experience_extraction_service import extract_experiences_from_url + +# .env 로드 +load_dotenv() + +def test_url_extraction(url: str): + print(f"=== URL 경험 추출 테스트 시작 ===") + print(f"타겟 URL: {url}\n") + + try: + result = extract_experiences_from_url(url) + print("[추출 성공!]\n") + + for idx, exp in enumerate(result.experiences, 1): + print(f"--- 경험 후보 {idx} ---") + print(f"경험명: {exp.experience_name}") + print(f"경험 유형: {exp.experience_type}") + print(f"기관/소속: {exp.organization}") + print(f"기간: {exp.period}") + print(f"나의 역할: {exp.my_role}") + print(f"S (상황): {exp.situation}") + print(f"A (행동): {exp.action}") + print(f"R (결과): {exp.result}") + print(f"L (배운점): {exp.learnings}") + print(f"핵심 역량 태그: {exp.core_competencies}") + print(f"활용 가능 문항: {exp.applicable_questions}") + print(f"원문 출처 요약:\n{exp.source_text[:300]}...") + print(f"상태: {exp.status}\n") + + except Exception as e: + print(f"에러 발생: {e}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="URL 기반 자소서 경험 추출 테스트 스크립트") + parser.add_argument( + "--url", + type=str, + default="https://raw.githubusercontent.com/ApptiveDev/pickd_AI/main/README.md", # 기본 임시 URL 또는 테스트용 URL + help="경험을 추출할 자소서 또는 포트폴리오 웹페이지 URL" + ) + + args = parser.parse_args() + + # 예시 실행용 임시 URL이 기본값인 경우 가이드 출력 + if args.url == "https://raw.githubusercontent.com/ApptiveDev/pickd_AI/main/README.md": + print("[안내] --url 인자를 제공하지 않아 테스트용 기본 URL로 실행됩니다.") + print("실제 자소서 URL로 테스트하려면 다음과 같이 실행하세요:") + print("python scripts/test_experience_extraction_url.py --url '원하는_자소서_링크'\n") + + test_url_extraction(args.url) From 241cd56e689f026c058ba2965d21838ef9d93868 Mon Sep 17 00:00:00 2001 From: tomchccom Date: Sun, 17 May 2026 16:30:39 +0900 Subject: [PATCH 3/6] =?UTF-8?q?chore:=20LangSmith=20=ED=8A=B8=EB=9E=98?= =?UTF-8?q?=ED=82=B9=20=EC=84=B1=EB=8A=A5=20=EA=B0=95=ED=99=94=EB=A5=BC=20?= =?UTF-8?q?=EC=9C=84=ED=95=9C=20=ED=83=9C=EA=B7=B8=20=EB=B0=8F=20=EB=A9=94?= =?UTF-8?q?=ED=83=80=EB=8D=B0=EC=9D=B4=ED=84=B0=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../app/services/experience_extraction_service.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/myeongsung/app/services/experience_extraction_service.py b/myeongsung/app/services/experience_extraction_service.py index 4eb4e7e..35099d2 100644 --- a/myeongsung/app/services/experience_extraction_service.py +++ b/myeongsung/app/services/experience_extraction_service.py @@ -39,7 +39,18 @@ def extract_experiences_from_text(text: str) -> ExperienceExtractionResponse: chain = prompt | llm.with_structured_output(ExperienceExtractionResponse) try: - result = chain.invoke({"text": text}, config={"run_name": "experience-extraction"}) + result = chain.invoke( + {"text": text}, + config={ + "run_name": "experience-extraction", + "tags": ["experience-extraction", "cover-letter", "ncs-public"], + "metadata": { + "model": "gpt-4o", + "extraction_format": "STAR+L", + "project": "pickd" + } + } + ) return result except Exception as e: raise ValueError(f"경험 추출 중 오류가 발생했습니다: {str(e)}") From e804198bcb0e8560e810cddea28ff4c4f89e26fc Mon Sep 17 00:00:00 2001 From: tomchccom Date: Sun, 17 May 2026 22:25:53 +0900 Subject: [PATCH 4/6] =?UTF-8?q?test:=20=EA=B3=B5=EA=B8=B0=EC=97=85=20?= =?UTF-8?q?=EC=9E=90=EC=86=8C=EC=84=9C=20=EB=B0=B0=EC=B9=98=20=ED=85=8C?= =?UTF-8?q?=EC=8A=A4=ED=8A=B8=20=EC=8A=A4=ED=81=AC=EB=A6=BD=ED=8A=B8=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80=20=EB=B0=8F=20root=20pdf=20=EB=AC=B4?= =?UTF-8?q?=EC=8B=9C=20=EC=84=A4=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- myeongsung/.gitignore | 1 + .../test_batch_experience_extraction.py | 85 +++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 myeongsung/scripts/test_batch_experience_extraction.py diff --git a/myeongsung/.gitignore b/myeongsung/.gitignore index 97a3514..b6aa5b3 100644 --- a/myeongsung/.gitignore +++ b/myeongsung/.gitignore @@ -42,3 +42,4 @@ wheels/ data/ ai_job_extraction_api_spec.md tests/results/ +/*.pdf diff --git a/myeongsung/scripts/test_batch_experience_extraction.py b/myeongsung/scripts/test_batch_experience_extraction.py new file mode 100644 index 0000000..5a5bc91 --- /dev/null +++ b/myeongsung/scripts/test_batch_experience_extraction.py @@ -0,0 +1,85 @@ +import os +import sys +import json +import fitz # PyMuPDF +from dotenv import load_dotenv + +# sys.path에 app 폴더 상위 경로 추가 +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from app.services.experience_extraction_service import extract_experiences_from_text + +# .env 로드 +load_dotenv() + +def run_batch_extraction(): + # 타겟 파일 리스트 + target_files = [ + "국민건강.pdf", + "한국거래소.pdf", + "한국전력공사 합격자소서.pdf", + "한전2.pdf" + ] + + workspace_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + results_dir = os.path.join(workspace_dir, "tests", "results") + os.makedirs(results_dir, exist_ok=True) + + all_results = {} + + print("=== 공기업 자소서 경험 추출 배치 테스트 시작 ===\n") + + for filename in target_files: + filepath = os.path.join(workspace_dir, filename) + if not os.path.exists(filepath): + print(f"⚠️ 파일을 찾을 수 없습니다: {filename} (경로: {filepath}) - 건너뜁니다.") + continue + + print(f"🔄 분석 중: {filename}...") + try: + # PyMuPDF로 텍스트 추출 + doc = fitz.open(filepath) + text_list = [] + for page in doc: + text_list.append(page.get_text()) + full_text = "\n".join(text_list) + + if not full_text.strip(): + print(f"❌ {filename}에서 텍스트를 추출하지 못했습니다.") + continue + + # 경험 추출 실행 + extraction_result = extract_experiences_from_text(full_text) + + # 결과 저장 + experiences_list = [] + for exp in extraction_result.experiences: + experiences_list.append(exp.model_dump()) + + all_results[filename] = { + "status": "SUCCESS", + "extracted_count": len(experiences_list), + "experiences": experiences_list + } + + print(f"✅ 완료: {filename} (추출된 경험 후보: {len(experiences_list)}개)") + + except Exception as e: + all_results[filename] = { + "status": "FAILED", + "error": str(e) + } + print(f"❌ 실패: {filename} (오류: {e})") + + print("-" * 50) + + # 결과 JSON 저장 + output_path = os.path.join(results_dir, "experience_extraction_report.json") + with open(output_path, "w", encoding="utf-8") as f: + json.dump(all_results, f, ensure_ascii=False, indent=2) + + print(f"\n🎉 모든 배치 분석이 완료되었습니다!") + print(f"📊 결과 보고서 저장 위치: {output_path}") + +if __name__ == "__main__": + run_batch_extraction() From ba5ad5a2f3a12642292ddbd972cb2bd7e12e2789 Mon Sep 17 00:00:00 2001 From: tomchccom Date: Sun, 17 May 2026 22:53:10 +0900 Subject: [PATCH 5/6] =?UTF-8?q?chore:=20=EC=9E=90=EC=86=8C=EC=84=9C=20?= =?UTF-8?q?=EA=B2=BD=ED=97=98=20=EC=B6=94=EC=B6=9C=20API=20=EB=AA=85?= =?UTF-8?q?=EC=84=B8=EC=84=9C=20=ED=8C=8C=EC=9D=BC=20=EB=AC=B4=EC=8B=9C=20?= =?UTF-8?q?=EA=B7=9C=EC=B9=99=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- myeongsung/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/myeongsung/.gitignore b/myeongsung/.gitignore index b6aa5b3..06ce65c 100644 --- a/myeongsung/.gitignore +++ b/myeongsung/.gitignore @@ -41,5 +41,6 @@ wheels/ # Data and API specs data/ ai_job_extraction_api_spec.md +ai_experience_extraction_api_spec.md tests/results/ /*.pdf From b0b8fdf606e41207276cf3aa6816b2446d25d846 Mon Sep 17 00:00:00 2001 From: mongdmin Date: Mon, 18 May 2026 15:54:30 +0900 Subject: [PATCH 6/6] =?UTF-8?q?ai=20=EC=97=B0=EB=8F=99=20=EC=8B=9C?= =?UTF-8?q?=EC=97=90=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 + .../app/services/job_analysis_service.py | 108 +++++++----------- myeongsung/logs/evaluation_history.jsonl | 12 ++ myeongsung/test_api.sh | 12 ++ 4 files changed, 70 insertions(+), 65 deletions(-) create mode 100755 myeongsung/test_api.sh diff --git a/.gitignore b/.gitignore index 303a196..0895c29 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ .cursor/ + +# Environment Variables +.env diff --git a/myeongsung/app/services/job_analysis_service.py b/myeongsung/app/services/job_analysis_service.py index 2a4c381..5833afc 100644 --- a/myeongsung/app/services/job_analysis_service.py +++ b/myeongsung/app/services/job_analysis_service.py @@ -71,7 +71,7 @@ def _analyze_with_vision(image_url: str, google_api_key: str) -> Optional[JobPos image = PIL.Image.open(io.BytesIO(img_response.content)) response = client.models.generate_content( - model='gemini-2.0-flash', + model='gemini-2.5-flash', contents=[image, _VISION_SYSTEM_PROMPT], config=types.GenerateContentConfig( response_mime_type='application/json', @@ -127,74 +127,52 @@ def _analyze_with_text(markdown: str) -> Optional[JobPostingCreate]: # 4. 지능적 병합 엔진 # ────────────────────────────────────────────── -def _smart_merge(text_result: Optional[JobPostingCreate], +def _smart_merge(text_result: Optional[JobPostingCreate], vision_result: Optional[JobPostingCreate]) -> JobPostingCreate: """ 텍스트 엔진과 비전 엔진의 결과를 지능적으로 병합. 원칙: 텍스트 결과를 기본으로 하되, 비전 결과가 더 풍부한 필드는 비전 결과를 채택. """ - # 둘 다 없으면 에러 - if not text_result and not vision_result: - raise ValueError("텍스트 분석과 비전 분석 모두 실패했습니다.") - - # 하나만 있으면 그것을 사용 - if not text_result: - return vision_result - if not vision_result: - return text_result - - merged = text_result.model_copy(deep=True) - - # sections: 비전이 더 많은 부문을 발견했으면 비전 결과 채택 - if len(vision_result.sections) > len(merged.sections): - merged.sections = vision_result.sections - - # processes: 비어있으면 비전에서 가져옴 - if not merged.processes and vision_result.processes: - merged.processes = vision_result.processes - - # documents - if not merged.documents and vision_result.documents: - merged.documents = vision_result.documents - - # company_info: 비전이 더 풍부하면 채택 - if vision_result.company_info: - if not merged.company_info: - merged.company_info = vision_result.company_info - else: - # 개별 필드 단위로 보완 - for field_name in vision_result.company_info.model_fields: - vision_val = getattr(vision_result.company_info, field_name, None) - merged_val = getattr(merged.company_info, field_name, None) - if vision_val and not merged_val: - setattr(merged.company_info, field_name, vision_val) - - # guideline - if vision_result.guideline: - if not merged.guideline: - merged.guideline = vision_result.guideline - else: - for field_name in vision_result.guideline.model_fields: - vision_val = getattr(vision_result.guideline, field_name, None) - merged_val = getattr(merged.guideline, field_name, None) - if vision_val and not merged_val: - setattr(merged.guideline, field_name, vision_val) - - # 단순 필드 보완 (비어있으면 비전에서 가져옴) - for field_name in ["employment_type", "headcount", "region_1depth", "workplace_address", "notice_url"]: - merged_val = getattr(merged, field_name, None) - vision_val = getattr(vision_result, field_name, None) - if not merged_val and vision_val: - setattr(merged, field_name, vision_val) - - # citations 합치기 (중복 제거) - existing_contents = {c.content[:50] for c in merged.citations} - for cit in vision_result.citations: - if cit.content[:50] not in existing_contents: - merged.citations.append(cit) - - return merged - + try: + # 둘 다 없으면 에러 + if not text_result and not vision_result: + raise ValueError("텍스트 분석과 비전 분석 모두 실패했습니다.") + + # 하나만 있으면 그것을 사용 + if not text_result: + return vision_result + if not vision_result: + return text_result + + merged = text_result.model_copy(deep=True) + + # 1. 텍스트 정보가 부족할 수 있는 모집 부문(sections) 보완 + if vision_result.sections and len(vision_result.sections) > len(merged.sections): + merged.sections = vision_result.sections + + # 2. 전형 절차(processes) 및 서류(documents) 보완 + if not merged.processes and vision_result.processes: + merged.processes = vision_result.processes + if not merged.documents and vision_result.documents: + merged.documents = vision_result.documents + + # 3. 최상위 필드 보완 (기업명, 공고명, 고용형태 등) + for field_name in ["company_name", "notice_name", "employment_type", "headcount", "region_1depth", "workplace_address", "notice_url"]: + merged_val = getattr(merged, field_name, None) + vision_val = getattr(vision_result, field_name, None) + if (merged_val is None or merged_val == "" or merged_val == 0) and vision_val: + setattr(merged, field_name, vision_val) + + # 4. citations 합치기 (중복 제거 및 None 체크) + existing_contents = {c.content[:50] for c in merged.citations if c.content} + for cit in vision_result.citations: + if cit.content and cit.content[:50] not in existing_contents: + merged.citations.append(cit) + + return merged + except Exception as e: + print(f"[!] 결과 병합 중 오류 발생: {e}") + return text_result if text_result else vision_result # ────────────────────────────────────────────── # 5. 메인 엔트리포인트 @@ -231,7 +209,7 @@ def analyze_job_url(url: str) -> JobPostingCreate: # 3. 비전 엔진 실행 (항상 실행하여 보완) vision_result = None if screenshot_url and google_api_key: - print("[*] 비전 엔진(Gemini 2.0 Flash) 분석 중...") + print("[*] 비전 엔진(Gemini 2.5 Flash) 분석 중...") vision_result = _analyze_with_vision(screenshot_url, google_api_key) # 4. 지능적 병합 diff --git a/myeongsung/logs/evaluation_history.jsonl b/myeongsung/logs/evaluation_history.jsonl index 4c51132..0ceff6b 100644 --- a/myeongsung/logs/evaluation_history.jsonl +++ b/myeongsung/logs/evaluation_history.jsonl @@ -1 +1,13 @@ {"timestamp": "2026-05-04T01:49:04.795016", "case_name": "https://www.saramin.co.kr/zf_user/jobs/relay/view?view_type=list&rec_idx=53682014&t_ref=theme-hmgpartnerjob&t_ref_content=logo_recruit&t_ref_area=411#seq=0", "overall_score": 0.95, "metrics": {"overall_score": 0.95, "completeness_score": 0.9, "accuracy_score": 0.95, "structure_score": 1.0, "hallucination_score": 0.95, "field_scores": [{"field_name": "company_name", "score": 1.0, "reason": "Correctly extracted."}, {"field_name": "notice_name", "score": 1.0, "reason": "Correctly extracted."}, {"field_name": "category", "score": 1.0, "reason": "Correctly extracted."}, {"field_name": "employment_type", "score": 1.0, "reason": "Correctly extracted."}, {"field_name": "posted_at", "score": 1.0, "reason": "Correctly extracted."}, {"field_name": "started_at", "score": 1.0, "reason": "Correctly extracted."}, {"field_name": "ended_at", "score": 1.0, "reason": "Correctly extracted."}, {"field_name": "notice_url", "score": 1.0, "reason": "Correctly extracted."}, {"field_name": "headcount", "score": 0.5, "reason": "Extracted as null, but should be '0명'."}, {"field_name": "region_1depth", "score": 1.0, "reason": "Correctly extracted."}, {"field_name": "workplace_address", "score": 1.0, "reason": "Correctly extracted."}, {"field_name": "sections", "score": 1.0, "reason": "All section details are correctly extracted."}, {"field_name": "processes", "score": 1.0, "reason": "Correctly extracted."}, {"field_name": "documents", "score": 1.0, "reason": "Correctly extracted."}, {"field_name": "company_info", "score": 1.0, "reason": "Correctly extracted."}, {"field_name": "guideline", "score": 1.0, "reason": "Correctly extracted."}], "summary": "The extraction is highly accurate and well-structured, capturing almost all details correctly from the source content.", "improvements": ["Ensure the 'headcount' field is extracted correctly as '0명' instead of null."]}, "extraction_summary": {"company": "(주)유니테크", "notice": "[울산]B2B영업 신입 채용(자동차 및 조선산업군)"}} +{"timestamp": "2026-05-18T13:37:40.454028", "case_name": "https://www.wanted.co.kr/wd/231144", "overall_score": 0.825, "metrics": {"overall_score": 0.825, "completeness_score": 0.7, "accuracy_score": 0.8, "structure_score": 0.9, "hallucination_score": 0.9, "field_scores": [{"field_name": "company_name", "score": 1.0, "reason": "The company name '리체' is correctly extracted."}, {"field_name": "notice_name", "score": 1.0, "reason": "The notice name 'MD (Platform & Seller Growth Manager)' is correctly extracted."}, {"field_name": "category", "score": 1.0, "reason": "The category 'FULL_TIME' is correctly extracted."}, {"field_name": "employment_type", "score": 0.5, "reason": "The employment type is null, which might be due to missing information in the source."}, {"field_name": "started_at", "score": 0.0, "reason": "The 'started_at' field contains '/OpenStreetMap', which is incorrect and likely a placeholder error."}, {"field_name": "ended_at", "score": 1.0, "reason": "The 'ended_at' field is null, which is acceptable if the information is not provided."}, {"field_name": "notice_url", "score": 1.0, "reason": "The notice URL is correctly extracted."}, {"field_name": "headcount", "score": 1.0, "reason": "The headcount is null, which is acceptable if the information is not provided."}, {"field_name": "region_1depth", "score": 1.0, "reason": "The region '서울특별시' is correctly extracted."}, {"field_name": "workplace_address", "score": 1.0, "reason": "The workplace address is correctly extracted."}, {"field_name": "sections", "score": 0.8, "reason": "The sections are mostly complete, but there might be additional details in the source that are not captured."}, {"field_name": "processes", "score": 1.0, "reason": "The processes field is empty, which is acceptable if no information is provided."}, {"field_name": "documents", "score": 1.0, "reason": "The documents field is empty, which is acceptable if no information is provided."}, {"field_name": "citations", "score": 0.9, "reason": "Citations are mostly accurate, but there might be minor discrepancies in the content or missing citations."}], "summary": "The extracted JSON captures most of the key information accurately, but there are some issues with completeness and minor errors in certain fields.", "improvements": ["Ensure the 'started_at' field is correctly extracted and not a placeholder.", "Verify if there are additional details in the 'sections' that could be included.", "Check for any missing citations or discrepancies in the citation content."]}, "extraction_summary": {"company": "리체", "notice": "MD (Platform & Seller Growth Manager)"}} +{"timestamp": "2026-05-18T13:38:17.693944", "case_name": "https://www.wanted.co.kr/wd/231144", "overall_score": 0.75, "metrics": {"overall_score": 0.75, "completeness_score": 0.8, "accuracy_score": 0.7, "structure_score": 0.9, "hallucination_score": 0.6, "field_scores": [{"field_name": "company_name", "score": 1.0, "reason": "The company name '리체' is correctly extracted."}, {"field_name": "notice_name", "score": 1.0, "reason": "The notice name 'MD (Platform & Seller Growth Manager)' is correctly extracted."}, {"field_name": "category", "score": 1.0, "reason": "The category 'FULL_TIME' is correctly extracted."}, {"field_name": "employment_type", "score": 0.5, "reason": "The employment type is null, which might be due to missing information in the source."}, {"field_name": "started_at", "score": 0.0, "reason": "The 'started_at' field contains '/OpenStreetMap', which is incorrect and likely a placeholder or error."}, {"field_name": "ended_at", "score": 0.5, "reason": "The 'ended_at' field is null, which might be due to missing information in the source."}, {"field_name": "notice_url", "score": 1.0, "reason": "The notice URL is correctly extracted."}, {"field_name": "headcount", "score": 0.5, "reason": "The headcount is null, which might be due to missing information in the source."}, {"field_name": "region_1depth", "score": 1.0, "reason": "The region '서울특별시' is correctly extracted."}, {"field_name": "workplace_address", "score": 1.0, "reason": "The workplace address is correctly extracted."}, {"field_name": "sections", "score": 0.9, "reason": "The sections are mostly complete and accurate, but there might be minor omissions or errors."}, {"field_name": "processes", "score": 0.5, "reason": "The processes field is empty, which might be due to missing information in the source."}, {"field_name": "documents", "score": 0.5, "reason": "The documents field is empty, which might be due to missing information in the source."}, {"field_name": "citations", "score": 0.8, "reason": "Citations are mostly accurate, but there might be minor issues with completeness or accuracy."}], "summary": "The extracted JSON captures most of the key information from the source content, but there are some issues with completeness and accuracy, particularly with the 'started_at' field and missing information in 'employment_type', 'ended_at', 'headcount', 'processes', and 'documents'.", "improvements": ["Correct the 'started_at' field to reflect the actual start date or remove incorrect placeholder text.", "Ensure all fields are populated with available information from the source, particularly 'employment_type', 'ended_at', 'headcount', 'processes', and 'documents'.", "Verify the accuracy of all extracted fields to ensure they match the source content."]}, "extraction_summary": {"company": "리체", "notice": "MD (Platform & Seller Growth Manager)"}} +{"timestamp": "2026-05-18T13:47:25.113127", "case_name": "https://www.wanted.co.kr/wd/231144", "overall_score": 0.8, "metrics": {"overall_score": 0.8, "completeness_score": 0.7, "accuracy_score": 0.8, "structure_score": 0.9, "hallucination_score": 0.8, "field_scores": [{"field_name": "company_name", "score": 1.0, "reason": "The company name '리체' is correctly extracted."}, {"field_name": "notice_name", "score": 1.0, "reason": "The notice name 'MD (Platform & Seller Growth Manager)' is correctly extracted."}, {"field_name": "category", "score": 1.0, "reason": "The category 'FULL_TIME' is correctly extracted."}, {"field_name": "employment_type", "score": 0.5, "reason": "The employment type is null, which might be due to missing information in the source."}, {"field_name": "started_at", "score": 0.0, "reason": "The 'started_at' field contains '/OpenStreetMap', which is incorrect and likely a placeholder error."}, {"field_name": "ended_at", "score": 0.5, "reason": "The 'ended_at' field is null, which might be due to missing information in the source."}, {"field_name": "notice_url", "score": 1.0, "reason": "The notice URL is correctly extracted."}, {"field_name": "headcount", "score": 0.5, "reason": "The headcount is null, which might be due to missing information in the source."}, {"field_name": "region_1depth", "score": 1.0, "reason": "The region '서울특별시' is correctly extracted."}, {"field_name": "workplace_address", "score": 1.0, "reason": "The workplace address is correctly extracted."}, {"field_name": "sections", "score": 0.9, "reason": "The sections are well-structured and contain relevant information, but some fields like 'processes' and 'documents' are empty."}], "summary": "The extraction captures most of the key information accurately, but there are issues with some fields being null or incorrect, such as 'started_at'. The structure is generally good, but completeness is affected by missing fields.", "improvements": ["Correct the 'started_at' field to reflect the actual start date or leave it null if not available.", "Ensure that fields like 'employment_type', 'ended_at', and 'headcount' are filled if the information is available in the source.", "Verify that all sections are complete and accurately reflect the source content."]}, "extraction_summary": {"company": "리체", "notice": "MD (Platform & Seller Growth Manager)"}} +{"timestamp": "2026-05-18T14:19:24.063228", "case_name": "https://www.wanted.co.kr/wd/231144", "overall_score": 0.8, "metrics": {"overall_score": 0.8, "completeness_score": 0.7, "accuracy_score": 0.8, "structure_score": 0.9, "hallucination_score": 0.8, "field_scores": [{"field_name": "company_name", "score": 1.0, "reason": "The company name '리체' is correctly extracted."}, {"field_name": "notice_name", "score": 1.0, "reason": "The notice name 'MD (Platform & Seller Growth Manager)' is correctly extracted."}, {"field_name": "category", "score": 1.0, "reason": "The category 'FULL_TIME' is correctly extracted."}, {"field_name": "employment_type", "score": 0.5, "reason": "The employment type is null, which might be due to missing information in the source."}, {"field_name": "started_at", "score": 0.0, "reason": "The 'started_at' field contains '/OpenStreetMap', which is incorrect and likely a placeholder error."}, {"field_name": "ended_at", "score": 0.5, "reason": "The 'ended_at' field is null, which might be due to missing information in the source."}, {"field_name": "notice_url", "score": 1.0, "reason": "The notice URL is correctly extracted."}, {"field_name": "headcount", "score": 0.5, "reason": "The headcount is null, which might be due to missing information in the source."}, {"field_name": "region_1depth", "score": 1.0, "reason": "The region '서울특별시' is correctly extracted."}, {"field_name": "workplace_address", "score": 1.0, "reason": "The workplace address is correctly extracted."}, {"field_name": "sections", "score": 0.9, "reason": "The sections are mostly complete and accurate, but there might be missing details in preferences."}, {"field_name": "processes", "score": 0.5, "reason": "The processes field is empty, which might be due to missing information in the source."}, {"field_name": "documents", "score": 0.5, "reason": "The documents field is empty, which might be due to missing information in the source."}, {"field_name": "citations", "score": 1.0, "reason": "Citations are correctly extracted and linked to the source URL."}], "summary": "The extraction captures most of the key information accurately, but there are issues with some fields being null or incorrect, such as 'started_at'. The structure is well-organized, but completeness is affected by missing fields like 'processes' and 'documents'.", "improvements": ["Correct the 'started_at' field to reflect the actual start date or remove incorrect placeholder text.", "Ensure all fields are populated if the information is available in the source, such as 'employment_type', 'ended_at', 'headcount', 'processes', and 'documents'.", "Verify the completeness of the 'preferences' section to ensure all relevant preferences are captured."]}, "extraction_summary": {"company": "리체", "notice": "MD (Platform & Seller Growth Manager)"}} +{"timestamp": "2026-05-18T14:30:47.523939", "case_name": "https://www.wanted.co.kr/wd/231144", "overall_score": 0.75, "metrics": {"overall_score": 0.75, "completeness_score": 0.7, "accuracy_score": 0.8, "structure_score": 0.8, "hallucination_score": 0.7, "field_scores": [{"field_name": "company_name", "score": 1.0, "reason": "The company name '리체' is correctly extracted."}, {"field_name": "notice_name", "score": 1.0, "reason": "The notice name 'MD (Platform & Seller Growth Manager)' is correctly extracted."}, {"field_name": "category", "score": 1.0, "reason": "The category 'FULL_TIME' is correctly extracted."}, {"field_name": "employment_type", "score": 0.5, "reason": "The employment type is null, which might be due to missing information in the source."}, {"field_name": "started_at", "score": 0.0, "reason": "The 'started_at' field contains incorrect data '/OpenStreetMap', which is not relevant."}, {"field_name": "ended_at", "score": 0.5, "reason": "The 'ended_at' field is null, which might be due to missing information in the source."}, {"field_name": "notice_url", "score": 1.0, "reason": "The notice URL is correctly extracted."}, {"field_name": "headcount", "score": 0.5, "reason": "The headcount is null, which might be due to missing information in the source."}, {"field_name": "region_1depth", "score": 1.0, "reason": "The region '서울특별시' is correctly extracted."}, {"field_name": "workplace_address", "score": 1.0, "reason": "The workplace address is correctly extracted."}, {"field_name": "sections", "score": 0.8, "reason": "The sections are mostly complete, but some fields like 'processes' and 'documents' are missing."}, {"field_name": "processes", "score": 0.0, "reason": "The processes field is empty, indicating missing information."}, {"field_name": "documents", "score": 0.0, "reason": "The documents field is empty, indicating missing information."}, {"field_name": "citations", "score": 0.8, "reason": "Citations are mostly correct, but some fields like 'bbox', 'element_id', 'page_width', and 'page_height' are null."}], "summary": "The extracted JSON captures most of the key information from the source content, but there are some missing fields and inaccuracies.", "improvements": ["Correct the 'started_at' field to reflect accurate information.", "Ensure 'processes' and 'documents' fields are populated if applicable.", "Verify and complete the 'employment_type', 'ended_at', and 'headcount' fields if information is available.", "Improve the citation details by providing complete metadata."]}, "extraction_summary": {"company": "리체", "notice": "MD (Platform & Seller Growth Manager)"}} +{"timestamp": "2026-05-18T15:35:09.657855", "case_name": "https://www.wanted.co.kr/wd/231144", "overall_score": 0.8, "metrics": {"overall_score": 0.8, "completeness_score": 0.7, "accuracy_score": 0.8, "structure_score": 0.9, "hallucination_score": 0.8, "field_scores": [{"field_name": "company_name", "score": 1.0, "reason": "The company name '리체' is correctly extracted."}, {"field_name": "notice_name", "score": 1.0, "reason": "The notice name 'MD (Platform & Seller Growth Manager)' is correctly extracted."}, {"field_name": "category", "score": 1.0, "reason": "The category 'FULL_TIME' is correctly extracted."}, {"field_name": "employment_type", "score": 0.5, "reason": "The employment type is null, which might be due to missing information in the source."}, {"field_name": "started_at", "score": 0.0, "reason": "The 'started_at' field contains '/OpenStreetMap', which is incorrect and likely a placeholder error."}, {"field_name": "ended_at", "score": 0.5, "reason": "The 'ended_at' field is null, which might be due to missing information in the source."}, {"field_name": "notice_url", "score": 1.0, "reason": "The notice URL is correctly extracted."}, {"field_name": "headcount", "score": 0.5, "reason": "The headcount is null, which might be due to missing information in the source."}, {"field_name": "region_1depth", "score": 1.0, "reason": "The region '서울특별시' is correctly extracted."}, {"field_name": "workplace_address", "score": 1.0, "reason": "The workplace address is correctly extracted."}, {"field_name": "sections", "score": 0.9, "reason": "The sections are well-structured and contain relevant information, but some fields like 'headcount' are null."}, {"field_name": "processes", "score": 0.5, "reason": "The processes field is empty, which might be due to missing information in the source."}, {"field_name": "documents", "score": 0.5, "reason": "The documents field is empty, which might be due to missing information in the source."}, {"field_name": "citations", "score": 1.0, "reason": "Citations are correctly extracted and linked to the source URL."}], "summary": "The extracted JSON captures most of the key information from the source content, but there are some issues with missing or incorrect data, particularly in the 'started_at' field and null values for 'employment_type', 'ended_at', 'headcount', 'processes', and 'documents'.", "improvements": ["Correct the 'started_at' field to reflect the actual start date or remove incorrect placeholder text.", "Ensure that fields like 'employment_type', 'ended_at', 'headcount', 'processes', and 'documents' are populated if the information is available in the source.", "Verify that all extracted fields are accurate and complete, especially those with null values."]}, "extraction_summary": {"company": "리체", "notice": "MD (Platform & Seller Growth Manager)"}} +{"timestamp": "2026-05-18T15:36:42.007295", "case_name": "https://www.wanted.co.kr/wd/231144", "overall_score": 0.825, "metrics": {"overall_score": 0.825, "completeness_score": 0.7, "accuracy_score": 0.8, "structure_score": 0.9, "hallucination_score": 0.9, "field_scores": [{"field_name": "company_name", "score": 1.0, "reason": "The company name '리체' is correctly extracted."}, {"field_name": "notice_name", "score": 1.0, "reason": "The notice name 'MD (Platform & Seller Growth Manager)' is correctly extracted."}, {"field_name": "category", "score": 1.0, "reason": "The category 'FULL_TIME' is correctly extracted."}, {"field_name": "employment_type", "score": 0.5, "reason": "The employment type is null, which might be due to missing information in the source."}, {"field_name": "started_at", "score": 0.0, "reason": "The 'started_at' field contains '/OpenStreetMap', which is incorrect and likely a placeholder error."}, {"field_name": "ended_at", "score": 0.5, "reason": "The 'ended_at' field is null, which might be due to missing information in the source."}, {"field_name": "notice_url", "score": 1.0, "reason": "The notice URL is correctly extracted."}, {"field_name": "headcount", "score": 0.5, "reason": "The headcount is null, which might be due to missing information in the source."}, {"field_name": "region_1depth", "score": 1.0, "reason": "The region '서울특별시' is correctly extracted."}, {"field_name": "workplace_address", "score": 1.0, "reason": "The workplace address is correctly extracted."}, {"field_name": "sections", "score": 0.9, "reason": "The sections are mostly complete and accurate, but there might be minor omissions or errors."}], "summary": "The extraction is mostly accurate and well-structured, but there are some issues with completeness and minor errors in certain fields.", "improvements": ["Correct the 'started_at' field to reflect the actual start date or remove incorrect placeholder text.", "Ensure all fields are populated if the information is available in the source.", "Verify the accuracy of the 'employment_type' and 'headcount' fields."]}, "extraction_summary": {"company": "리체", "notice": "MD (Platform & Seller Growth Manager)"}} +{"timestamp": "2026-05-18T15:39:12.296703", "case_name": "https://www.wanted.co.kr/wd/231144", "overall_score": 0.75, "metrics": {"overall_score": 0.75, "completeness_score": 0.7, "accuracy_score": 0.8, "structure_score": 0.8, "hallucination_score": 0.7, "field_scores": [{"field_name": "company_name", "score": 1.0, "reason": "The company name '리체' is correctly extracted."}, {"field_name": "notice_name", "score": 1.0, "reason": "The notice name 'MD (Platform & Seller Growth Manager)' is correctly extracted."}, {"field_name": "category", "score": 1.0, "reason": "The category 'FULL_TIME' is correctly extracted."}, {"field_name": "employment_type", "score": 0.5, "reason": "The employment type is null, which may indicate missing information if it was present in the source."}, {"field_name": "started_at", "score": 0.0, "reason": "The 'started_at' field contains '/OpenStreetMap', which is incorrect and likely a placeholder or error."}, {"field_name": "ended_at", "score": 0.5, "reason": "The 'ended_at' field is null, which may be correct if the information was not provided, but it's unclear."}, {"field_name": "notice_url", "score": 1.0, "reason": "The notice URL is correctly extracted."}, {"field_name": "headcount", "score": 0.5, "reason": "The headcount is null, which may indicate missing information if it was present in the source."}, {"field_name": "region_1depth", "score": 1.0, "reason": "The region '서울특별시' is correctly extracted."}, {"field_name": "workplace_address", "score": 1.0, "reason": "The workplace address is correctly extracted."}, {"field_name": "sections", "score": 0.8, "reason": "The sections are mostly complete, but there might be missing details in preferences and processes."}, {"field_name": "processes", "score": 0.5, "reason": "The processes field is empty, which may indicate missing information if it was present in the source."}, {"field_name": "documents", "score": 0.5, "reason": "The documents field is empty, which may indicate missing information if it was present in the source."}, {"field_name": "citations", "score": 0.8, "reason": "Citations are mostly correct, but there might be missing or incomplete citation details."}], "summary": "The extracted JSON captures most of the key information from the source content, but there are some issues with missing or incorrect data, particularly in the 'started_at' field and potentially missing fields like 'employment_type', 'headcount', 'processes', and 'documents'.", "improvements": ["Correct the 'started_at' field to reflect accurate information.", "Ensure all relevant fields such as 'employment_type', 'headcount', 'processes', and 'documents' are filled if the information is available.", "Verify and complete the 'citations' field to ensure all necessary citation details are included."]}, "extraction_summary": {"company": "리체", "notice": "MD (Platform & Seller Growth Manager)"}} +{"timestamp": "2026-05-18T15:41:04.054001", "case_name": "https://www.wanted.co.kr/wd/231144", "overall_score": 0.75, "metrics": {"overall_score": 0.75, "completeness_score": 0.7, "accuracy_score": 0.8, "structure_score": 0.8, "hallucination_score": 0.7, "field_scores": [{"field_name": "company_name", "score": 1.0, "reason": "The company name '리체' is correctly extracted."}, {"field_name": "notice_name", "score": 1.0, "reason": "The notice name 'MD (Platform & Seller Growth Manager)' is correctly extracted."}, {"field_name": "category", "score": 1.0, "reason": "The category 'FULL_TIME' is correctly extracted."}, {"field_name": "employment_type", "score": 0.5, "reason": "The employment type is marked as null, which may indicate missing information."}, {"field_name": "started_at", "score": 0.0, "reason": "The 'started_at' field contains incorrect data '/OpenStreetMap', which is not relevant."}, {"field_name": "ended_at", "score": 1.0, "reason": "The 'ended_at' field is correctly marked as null, assuming no end date is provided."}, {"field_name": "notice_url", "score": 1.0, "reason": "The notice URL is correctly extracted."}, {"field_name": "headcount", "score": 1.0, "reason": "The headcount is correctly marked as null, assuming no information is provided."}, {"field_name": "region_1depth", "score": 1.0, "reason": "The region '서울특별시' is correctly extracted."}, {"field_name": "workplace_address", "score": 1.0, "reason": "The workplace address is correctly extracted."}, {"field_name": "sections", "score": 0.8, "reason": "The sections are mostly complete, but some fields like 'processes' and 'documents' are missing."}, {"field_name": "processes", "score": 0.0, "reason": "The 'processes' field is empty, indicating missing information."}, {"field_name": "documents", "score": 0.0, "reason": "The 'documents' field is empty, indicating missing information."}, {"field_name": "citations", "score": 1.0, "reason": "Citations are correctly extracted and linked to the source URL."}], "summary": "The extraction captures most of the key information accurately, but there are some issues with missing fields and incorrect data in 'started_at'.", "improvements": ["Correct the 'started_at' field to reflect accurate information.", "Include information for 'processes' and 'documents' if available.", "Ensure 'employment_type' is accurately captured if specified in the source."]}, "extraction_summary": {"company": "리체", "notice": "MD (Platform & Seller Growth Manager)"}} +{"timestamp": "2026-05-18T15:41:46.472720", "case_name": "https://www.wanted.co.kr/wd/231144", "overall_score": 0.75, "metrics": {"overall_score": 0.75, "completeness_score": 0.8, "accuracy_score": 0.7, "structure_score": 0.9, "hallucination_score": 0.6, "field_scores": [{"field_name": "company_name", "score": 1.0, "reason": "The company name '리체' is correctly extracted."}, {"field_name": "notice_name", "score": 1.0, "reason": "The notice name 'MD (Platform & Seller Growth Manager)' is correctly extracted."}, {"field_name": "category", "score": 1.0, "reason": "The category 'FULL_TIME' is correctly extracted."}, {"field_name": "employment_type", "score": 0.5, "reason": "The employment type is null, which might be due to missing information in the source."}, {"field_name": "started_at", "score": 0.0, "reason": "The 'started_at' field contains '/OpenStreetMap', which is incorrect and likely a placeholder error."}, {"field_name": "ended_at", "score": 0.5, "reason": "The 'ended_at' field is null, which might be due to missing information in the source."}, {"field_name": "notice_url", "score": 1.0, "reason": "The notice URL is correctly extracted."}, {"field_name": "headcount", "score": 0.5, "reason": "The headcount is null, which might be due to missing information in the source."}, {"field_name": "region_1depth", "score": 1.0, "reason": "The region '서울특별시' is correctly extracted."}, {"field_name": "workplace_address", "score": 1.0, "reason": "The workplace address is correctly extracted."}, {"field_name": "sections", "score": 0.9, "reason": "The sections are mostly complete and accurate, but there might be minor omissions or errors."}, {"field_name": "processes", "score": 0.5, "reason": "The processes field is empty, which might be due to missing information in the source."}, {"field_name": "documents", "score": 0.5, "reason": "The documents field is empty, which might be due to missing information in the source."}, {"field_name": "citations", "score": 0.8, "reason": "Citations are mostly accurate, but there might be minor errors or omissions."}], "summary": "The extracted JSON captures most of the key information from the source content, but there are some inaccuracies and missing fields. The 'started_at' field contains an incorrect placeholder, and several fields are null, possibly due to missing information in the source. The structure is well-organized, but hallucination issues are present due to incorrect data in 'started_at'.", "improvements": ["Correct the 'started_at' field to remove the placeholder '/OpenStreetMap'.", "Ensure all fields are populated with accurate data, or confirm if null values are due to missing source information.", "Verify the completeness of sections and citations to ensure no information is omitted."]}, "extraction_summary": {"company": "리체", "notice": "MD (Platform & Seller Growth Manager)"}} +{"timestamp": "2026-05-18T15:48:38.197801", "case_name": "https://www.wanted.co.kr/wd/231144", "overall_score": 0.8, "metrics": {"overall_score": 0.8, "completeness_score": 0.7, "accuracy_score": 0.8, "structure_score": 0.9, "hallucination_score": 0.8, "field_scores": [{"field_name": "company_name", "score": 1.0, "reason": "The company name '리체' is correctly extracted."}, {"field_name": "notice_name", "score": 1.0, "reason": "The notice name 'MD (Platform & Seller Growth Manager)' is correctly extracted."}, {"field_name": "category", "score": 1.0, "reason": "The category 'FULL_TIME' is correctly extracted."}, {"field_name": "employment_type", "score": 0.5, "reason": "The employment type is null, which might be due to missing information in the source."}, {"field_name": "started_at", "score": 0.0, "reason": "The 'started_at' field contains '/OpenStreetMap', which is incorrect and likely a placeholder error."}, {"field_name": "ended_at", "score": 0.5, "reason": "The 'ended_at' field is null, which might be due to missing information in the source."}, {"field_name": "notice_url", "score": 1.0, "reason": "The notice URL is correctly extracted."}, {"field_name": "headcount", "score": 0.5, "reason": "The headcount is null, which might be due to missing information in the source."}, {"field_name": "region_1depth", "score": 1.0, "reason": "The region '서울특별시' is correctly extracted."}, {"field_name": "workplace_address", "score": 1.0, "reason": "The workplace address is correctly extracted."}, {"field_name": "sections", "score": 0.9, "reason": "The sections are well-structured and mostly complete, but some fields like 'processes' and 'documents' are empty."}], "summary": "The extraction is mostly accurate and well-structured, but there are some issues with completeness and minor hallucinations.", "improvements": ["Correct the 'started_at' field to reflect accurate information.", "Ensure all fields are populated if the information is available in the source.", "Verify the 'employment_type' and 'headcount' fields for completeness."]}, "extraction_summary": {"company": "리체", "notice": "MD (Platform & Seller Growth Manager)"}} +{"timestamp": "2026-05-18T15:52:13.068523", "case_name": "https://www.wanted.co.kr/wd/231144", "overall_score": 0.7000000000000001, "metrics": {"overall_score": 0.7000000000000001, "completeness_score": 0.7, "accuracy_score": 0.7, "structure_score": 0.8, "hallucination_score": 0.6, "field_scores": [{"field_name": "company_name", "score": 1.0, "reason": "The company name '리체' is correctly extracted."}, {"field_name": "notice_name", "score": 0.5, "reason": "The notice name 'MD (Platform & Seller Growth Manager)' is extracted, but there is a discrepancy with '[로마드] 리빙 플랫폼 MD(5년 이상)' found in the citations."}, {"field_name": "category", "score": 0.5, "reason": "The category 'FULL_TIME' is extracted, but the citation suggests a different context '[로마드] 리빙 플랫폼 MD(5년 이상)'."}, {"field_name": "employment_type", "score": 0.5, "reason": "The employment type '정규직' is extracted, but the citation suggests a different context '[로마드] 리빙 플랫폼 MD(5년 이상)'."}, {"field_name": "started_at", "score": 0.0, "reason": "The 'started_at' field is incorrectly extracted as '/OpenStreetMap', which is not relevant."}, {"field_name": "ended_at", "score": 0.5, "reason": "The 'ended_at' field is null, which is partially correct as the position is closed, but not explicitly stated."}, {"field_name": "notice_url", "score": 0.5, "reason": "The notice URL is incorrect; it should match the source URL provided in the citations."}, {"field_name": "region_1depth", "score": 1.0, "reason": "The region '서울특별시' is correctly extracted."}, {"field_name": "workplace_address", "score": 1.0, "reason": "The workplace address '서울특별시 서초구 바우뫼로 162 우광빌딩' is correctly extracted."}, {"field_name": "sections", "score": 0.8, "reason": "The sections are mostly correct, but there are discrepancies in job titles and responsibilities compared to the citations."}], "summary": "The extraction captures several key fields correctly, such as company name and workplace address. However, there are discrepancies in the notice name, category, and employment type. The 'started_at' field is incorrectly extracted, and the notice URL does not match the source. The structure is mostly correct, but some fields contain hallucinated or incorrect data.", "improvements": ["Correct the 'started_at' field to reflect the actual start date or status.", "Ensure the notice name matches the source content accurately.", "Verify the notice URL to ensure it matches the source.", "Clarify the category and employment type based on the source content.", "Remove any irrelevant or hallucinated data, such as '/OpenStreetMap'."]}, "extraction_summary": {"company": "리체", "notice": "MD (Platform & Seller Growth Manager)"}} diff --git a/myeongsung/test_api.sh b/myeongsung/test_api.sh new file mode 100755 index 0000000..b3f4aa5 --- /dev/null +++ b/myeongsung/test_api.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +# 테스트용 데이터 +JD_URL="https://www.wanted.co.kr/wd/208424" +USER_PERSONA="기존에 하던 것을 끈기 있게 계속 이어가며 마무리하는 책임감 있는 스타일." +EXPERIENCES='[{"id":"1","title":"미래에셋 AI Agent 개발","priority":"상","tags":["AI","RAG"],"star":{"situation":"금융 은어 검색 품질이 낮은 문제 발생","task":"은어를 공식 종목명으로 변환하는 파이프라인 구축","action":"HyperCLOVA X Reranker 및 앙상블 Retriever 도입","result":"금융 용어 변환 성공 및 검색 품질 개선"}}]' +PROMPTS='["지원 직무와 관련하여 기술적 문제 해결 과정을 서술해 주세요."]' + +echo "[*] AI 서버로 분석 요청을 보냅니다 (1~2분 소요)..." + +# 모든 옵션을 한 줄로 연결하여 실행 +curl -s -X POST "http://127.0.0.1:8000/analyze-and-place" -F "jd_url=$JD_URL" -F "user_persona=$USER_PERSONA" -F "experiences_json=$EXPERIENCES" -F "essay_prompts_json=$PROMPTS" \ No newline at end of file