Skip to content

Commit 9357009

Browse files
committed
revise code
change file name revise crawler code
1 parent c12b7e0 commit 9357009

10 files changed

+1239
-1957
lines changed

Session_B/answer/03_BeautifulSoup-regular_expression_answer.ipynb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@
206206
},
207207
{
208208
"cell_type": "code",
209-
"execution_count": 8,
209+
"execution_count": 2,
210210
"metadata": {
211211
"collapsed": true
212212
},
@@ -219,7 +219,7 @@
219219
},
220220
{
221221
"cell_type": "code",
222-
"execution_count": 9,
222+
"execution_count": 3,
223223
"metadata": {},
224224
"outputs": [
225225
{
@@ -232,7 +232,7 @@
232232
],
233233
"source": [
234234
"# your codes\n",
235-
"response = requests.get(\"http://yp.518.com.tw/service-life.html?ctf=10\")\n",
235+
"response = requests.get(\"https://jimmy15923.github.io/518\")\n",
236236
"soup = BeautifulSoup(response.text, 'lxml')\n",
237237
"print(soup.find_all(\"li\",class_=\"comp_loca\", text = re.compile(\"新北\")))"
238238
]

Session_B/answer/05_crawler_final_practice_answer.ipynb

Lines changed: 30 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -547,14 +547,14 @@
547547
},
548548
{
549549
"cell_type": "code",
550-
"execution_count": 22,
550+
"execution_count": 11,
551551
"metadata": {},
552552
"outputs": [
553553
{
554554
"name": "stdout",
555555
"output_type": "stream",
556556
"text": [
557-
"正在抓取玩命關頭電影評論第 4 "
557+
"正在抓取玩命關頭電影評論第 35"
558558
]
559559
},
560560
{
@@ -585,56 +585,56 @@
585585
" </thead>\n",
586586
" <tbody>\n",
587587
" <tr>\n",
588-
" <th>0</th>\n",
589-
" <td>+LINE:dd963+台/北/外/送/茶/莊+男人尋樂秘密基地【看/主/頁/約/妹】\\r\\...</td>\n",
588+
" <th>345</th>\n",
589+
" <td>已經跳脫劇情好與不好\\r\\n另一種意境了\\r\\n永遠愛</td>\n",
590590
" <td>玩命關頭8\\n Fast &amp; Furious 8\\n</td>\n",
591591
" <td>5</td>\n",
592592
" </tr>\n",
593593
" <tr>\n",
594-
" <th>1</th>\n",
595-
" <td>第八集裏頭Diesel想讓核心人物唐老大一個較合適且灰暗的轉變,Diesel說:「我知道必須...</td>\n",
594+
" <th>346</th>\n",
595+
" <td>無冷場</td>\n",
596596
" <td>玩命關頭8\\n Fast &amp; Furious 8\\n</td>\n",
597-
" <td>3</td>\n",
597+
" <td>5</td>\n",
598598
" </tr>\n",
599599
" <tr>\n",
600-
" <th>2</th>\n",
601-
" <td>你是不是时常羡慕别人有着漂亮女友\\r\\n也希望自己能跟着漂亮女生做爱~\\r\\n悠悠让你自己...</td>\n",
600+
" <th>347</th>\n",
601+
" <td>為了家人,你願意付出多少\\n再一次在電影院裡哭到不能自己 謝謝玩命關頭 對我來說不只是爽片...</td>\n",
602602
" <td>玩命關頭8\\n Fast &amp; Furious 8\\n</td>\n",
603603
" <td>5</td>\n",
604604
" </tr>\n",
605605
" <tr>\n",
606-
" <th>3</th>\n",
607-
" <td>爽片也要有一點劇情吧..........</td>\n",
606+
" <th>348</th>\n",
607+
" <td>原來唐老大變成壞人是被心靈控制,然後唐老大為了保護家人自己死了,看了都快哭了,然後最後韓哥復...</td>\n",
608608
" <td>玩命關頭8\\n Fast &amp; Furious 8\\n</td>\n",
609609
" <td>1</td>\n",
610610
" </tr>\n",
611611
" <tr>\n",
612-
" <th>4</th>\n",
613-
" <td>劇情方面越來越狗血 一直家庭! 家庭! 家庭!\\r\\n不過爽度有 就綜合一下給兩顆星</td>\n",
612+
" <th>349</th>\n",
613+
" <td>WOW!!! SO AMAZING!!! :)\\r\\nWhat a great movie!...</td>\n",
614614
" <td>玩命關頭8\\n Fast &amp; Furious 8\\n</td>\n",
615-
" <td>2</td>\n",
615+
" <td>5</td>\n",
616616
" </tr>\n",
617617
" </tbody>\n",
618618
"</table>\n",
619619
"</div>"
620620
],
621621
"text/plain": [
622-
" comments \\\n",
623-
"0 +LINE:dd963+台/北/外/送/茶/莊+男人尋樂秘密基地【看/主/頁/約/妹】\\r\\... \n",
624-
"1 第八集裏頭Diesel想讓核心人物唐老大一個較合適且灰暗的轉變,Diesel說:「我知道必須... \n",
625-
"2 你是不是时常羡慕别人有着漂亮女友\\r\\n也希望自己能跟着漂亮女生做爱~\\r\\n悠悠让你自己... \n",
626-
"3 爽片也要有一點劇情吧.......... \n",
627-
"4 劇情方面越來越狗血 一直家庭! 家庭! 家庭!\\r\\n不過爽度有 就綜合一下給兩顆星 \n",
622+
" comments \\\n",
623+
"345 已經跳脫劇情好與不好\\r\\n另一種意境了\\r\\n永遠愛 \n",
624+
"346 無冷場 \n",
625+
"347 為了家人,你願意付出多少\\n再一次在電影院裡哭到不能自己 謝謝玩命關頭 對我來說不只是爽片... \n",
626+
"348 原來唐老大變成壞人是被心靈控制,然後唐老大為了保護家人自己死了,看了都快哭了,然後最後韓哥復... \n",
627+
"349 WOW!!! SO AMAZING!!! :)\\r\\nWhat a great movie!... \n",
628628
"\n",
629-
" movie star \n",
630-
"0 玩命關頭8\\n Fast & Furious 8\\n 5 \n",
631-
"1 玩命關頭8\\n Fast & Furious 8\\n 3 \n",
632-
"2 玩命關頭8\\n Fast & Furious 8\\n 5 \n",
633-
"3 玩命關頭8\\n Fast & Furious 8\\n 1 \n",
634-
"4 玩命關頭8\\n Fast & Furious 8\\n 2 "
629+
" movie star \n",
630+
"345 玩命關頭8\\n Fast & Furious 8\\n 5 \n",
631+
"346 玩命關頭8\\n Fast & Furious 8\\n 5 \n",
632+
"347 玩命關頭8\\n Fast & Furious 8\\n 5 \n",
633+
"348 玩命關頭8\\n Fast & Furious 8\\n 1 \n",
634+
"349 玩命關頭8\\n Fast & Furious 8\\n 5 "
635635
]
636636
},
637-
"execution_count": 22,
637+
"execution_count": 11,
638638
"metadata": {},
639639
"output_type": "execute_result"
640640
}
@@ -660,7 +660,7 @@
660660
"\n",
661661
"# 對每頁的評論送 requests,並把評論文字、星等抓下來,存進剛剛建好的空 list\n",
662662
"for i in range(1, page):\n",
663-
" sys.stdout.write(\"\\r正在抓取玩命關頭電影評論第 \" + str(i) + \" \")\n",
663+
" sys.stdout.write(\"\\r正在抓取玩命關頭電影評論第 \" + str(i) + \"\")\n",
664664
" response = requests.get(\"https://tw.movies.yahoo.com/movieinfo_review.html/id=6664?sort=update_ts&order=desc&page=\" + str(i) )\n",
665665
" soup = BeautifulSoup(response.text, \"html.parser\")\n",
666666
"\n",
@@ -677,8 +677,8 @@
677677
" \"movie\":movie_name,\n",
678678
" \"star\":star_all})\n",
679679
"\n",
680-
"# 看前五筆資料\n",
681-
"comment_df.head()"
680+
"# 看最後五筆資料\n",
681+
"comment_df.tail()"
682682
]
683683
}
684684
],

Session_B/practice/03_BeautifulSoup+regular_expression.ipynb

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -180,13 +180,13 @@
180180
},
181181
{
182182
"cell_type": "code",
183-
"execution_count": 8,
184-
"metadata": {
185-
"collapsed": true
186-
},
183+
"execution_count": null,
184+
"metadata": {},
187185
"outputs": [],
188186
"source": [
189-
"# your codes\n"
187+
"# your codes\n",
188+
"## 518 網頁伺服器無法容納多人同時 requests,請大家使用以下的網頁作 requests,其 html 的內容是一模一樣的\n",
189+
"response = requests.get(\"https://jimmy15923.github.io/518\")"
190190
]
191191
}
192192
],

Session_B/practice/05_crawler_final_practice.ipynb

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,13 @@
361361
"outputs": [],
362362
"source": [
363363
"# your codes\n",
364+
"\n",
365+
"\n",
366+
"\n",
367+
"\n",
368+
"\n",
369+
"\n",
370+
"\n",
364371
"\n"
365372
]
366373
},
@@ -399,6 +406,12 @@
399406
"outputs": [],
400407
"source": [
401408
"# your codes\n",
409+
"\n",
410+
"\n",
411+
"\n",
412+
"\n",
413+
"\n",
414+
"\n",
402415
"\n"
403416
]
404417
},
@@ -431,25 +444,26 @@
431444
" print(\"開始爬取 \", i+1, \" :\" , x)\n",
432445
" id = movie_id[x]\n",
433446
" response = requests.get(\"https://tw.movies.yahoo.com/movieinfo_review.html/id=\" + str(id))\n",
434-
" soup = BeautifulSoup(response.text, \"html.parser\")\n",
447+
" soup = BeautifulSoup(response.text, \"lxml\")\n",
435448
" if soup.find(\"div\", {\"class\":\"page_numbox\"}) != None:\n",
436449
" page = int(soup.find(\"div\", {\"class\":\"page_numbox\"}).find_all(\"a\")[-2].text)\n",
437450
" \n",
438451
" comment_all = []\n",
439452
" star_all = []\n",
440453
" comment_df = pd.DataFrame(columns = [\"movie\", \"comments\", \"star\"])\n",
441-
"\n",
454+
" \n",
455+
" movie_name = soup.find(\"div\", {\"class\":\"inform_title\"}).text\n",
456+
" \n",
442457
" for i in range(1, page):\n",
443-
" response = requests.get(\"https://tw.movies.yahoo.com/movieinfo_review.html/id=\" + id + \"?sort=create_ts&order=desc&page=\" + str(i) )\n",
444-
" soup = BeautifulSoup(response.text, \"html.parser\")\n",
458+
" response = requests.get(\"https://tw.movies.yahoo.com/movieinfo_review.html/id=\" + str(id) + \"?sort=update_ts&order=desc&page=\" + str(i) )\n",
459+
" soup = BeautifulSoup(response.text, \"lxml\")\n",
445460
"\n",
446-
" comment = [x.find(\"span\").text for x in soup.find_all(\"div\", {\"class\":\"usercom_inner _c\"})]\n",
461+
" comment = [x.find(\"span\", {\"class\":None}).text for x in soup.find_all(\"div\", {\"class\":\"usercom_inner _c\"})]\n",
447462
" comment_all.extend(comment)\n",
448463
"\n",
449464
" star = [comment.find(\"input\", {\"name\":\"score\"})['value'] for comment in soup.find_all(\"div\", {\"class\":\"usercom_inner _c\"})]\n",
450465
" star_all.extend(star)\n",
451466
"\n",
452-
" movie_name = soup.find(\"div\", {\"class\":\"inform_title\"}).text\n",
453467
" comment_df = pd.DataFrame({\"comments\":comment_all,\n",
454468
" \"movie\":movie_name,\n",
455469
" \"star\":star_all})\n",
@@ -460,7 +474,7 @@
460474
" star_all = []\n",
461475
" comment_df = pd.DataFrame(columns = [\"movie\", \"comments\", \"star\"])\n",
462476
" \n",
463-
" comment = [x.find(\"span\").text for x in soup.find_all(\"div\", {\"class\":\"usercom_inner _c\"})]\n",
477+
" comment = [x.find(\"span\", {\"class\":None}).text for x in soup.find_all(\"div\", {\"class\":\"usercom_inner _c\"})]\n",
464478
" comment_all.extend(comment)\n",
465479
"\n",
466480
" star = [comment.find(\"input\", {\"name\":\"score\"})['value'] for comment in soup.find_all(\"div\", {\"class\":\"usercom_inner _c\"})]\n",

0 commit comments

Comments
 (0)