|
24476 | 24476 | }, |
24477 | 24477 | { |
24478 | 24478 | "cell_type": "code", |
24479 | | - "execution_count": 3, |
| 24479 | + "execution_count": null, |
24480 | 24480 | "id": "57fa11d8-3152-48db-bb74-4575bbeca7bb", |
24481 | 24481 | "metadata": { |
24482 | 24482 | "colab": { |
@@ -24627,216 +24627,7 @@ |
24627 | 24627 | "hide-output" |
24628 | 24628 | ] |
24629 | 24629 | }, |
24630 | | - "outputs": [ |
24631 | | - { |
24632 | | - "name": "stderr", |
24633 | | - "output_type": "stream", |
24634 | | - "text": [ |
24635 | | - "2024-07-30 19:52:24,499 - BERTopic - Embedding - Transforming documents to embeddings.\n" |
24636 | | - ] |
24637 | | - }, |
24638 | | - { |
24639 | | - "data": { |
24640 | | - "application/vnd.jupyter.widget-view+json": { |
24641 | | - "model_id": "8141eb80bf784dcaa7a721459e6009ba", |
24642 | | - "version_major": 2, |
24643 | | - "version_minor": 0 |
24644 | | - }, |
24645 | | - "text/plain": [ |
24646 | | - "modules.json: 0%| | 0.00/349 [00:00<?, ?B/s]" |
24647 | | - ] |
24648 | | - }, |
24649 | | - "metadata": {}, |
24650 | | - "output_type": "display_data" |
24651 | | - }, |
24652 | | - { |
24653 | | - "data": { |
24654 | | - "application/vnd.jupyter.widget-view+json": { |
24655 | | - "model_id": "b2937413668d44d5a950c578c6455884", |
24656 | | - "version_major": 2, |
24657 | | - "version_minor": 0 |
24658 | | - }, |
24659 | | - "text/plain": [ |
24660 | | - "config_sentence_transformers.json: 0%| | 0.00/116 [00:00<?, ?B/s]" |
24661 | | - ] |
24662 | | - }, |
24663 | | - "metadata": {}, |
24664 | | - "output_type": "display_data" |
24665 | | - }, |
24666 | | - { |
24667 | | - "data": { |
24668 | | - "application/vnd.jupyter.widget-view+json": { |
24669 | | - "model_id": "25aeb49e460a47d1a966221a278c1aa9", |
24670 | | - "version_major": 2, |
24671 | | - "version_minor": 0 |
24672 | | - }, |
24673 | | - "text/plain": [ |
24674 | | - "README.md: 0%| | 0.00/10.7k [00:00<?, ?B/s]" |
24675 | | - ] |
24676 | | - }, |
24677 | | - "metadata": {}, |
24678 | | - "output_type": "display_data" |
24679 | | - }, |
24680 | | - { |
24681 | | - "data": { |
24682 | | - "application/vnd.jupyter.widget-view+json": { |
24683 | | - "model_id": "e3e26066cd75463299b647b72d4bf613", |
24684 | | - "version_major": 2, |
24685 | | - "version_minor": 0 |
24686 | | - }, |
24687 | | - "text/plain": [ |
24688 | | - "sentence_bert_config.json: 0%| | 0.00/53.0 [00:00<?, ?B/s]" |
24689 | | - ] |
24690 | | - }, |
24691 | | - "metadata": {}, |
24692 | | - "output_type": "display_data" |
24693 | | - }, |
24694 | | - { |
24695 | | - "data": { |
24696 | | - "application/vnd.jupyter.widget-view+json": { |
24697 | | - "model_id": "79495920a49d462981e54feba77ba67c", |
24698 | | - "version_major": 2, |
24699 | | - "version_minor": 0 |
24700 | | - }, |
24701 | | - "text/plain": [ |
24702 | | - "config.json: 0%| | 0.00/612 [00:00<?, ?B/s]" |
24703 | | - ] |
24704 | | - }, |
24705 | | - "metadata": {}, |
24706 | | - "output_type": "display_data" |
24707 | | - }, |
24708 | | - { |
24709 | | - "data": { |
24710 | | - "application/vnd.jupyter.widget-view+json": { |
24711 | | - "model_id": "c8b0666447f74c699f1af04653b1a761", |
24712 | | - "version_major": 2, |
24713 | | - "version_minor": 0 |
24714 | | - }, |
24715 | | - "text/plain": [ |
24716 | | - "model.safetensors: 0%| | 0.00/90.9M [00:00<?, ?B/s]" |
24717 | | - ] |
24718 | | - }, |
24719 | | - "metadata": {}, |
24720 | | - "output_type": "display_data" |
24721 | | - }, |
24722 | | - { |
24723 | | - "data": { |
24724 | | - "application/vnd.jupyter.widget-view+json": { |
24725 | | - "model_id": "43b105c5bbd542e78c6d50a3cc669cf3", |
24726 | | - "version_major": 2, |
24727 | | - "version_minor": 0 |
24728 | | - }, |
24729 | | - "text/plain": [ |
24730 | | - "tokenizer_config.json: 0%| | 0.00/350 [00:00<?, ?B/s]" |
24731 | | - ] |
24732 | | - }, |
24733 | | - "metadata": {}, |
24734 | | - "output_type": "display_data" |
24735 | | - }, |
24736 | | - { |
24737 | | - "data": { |
24738 | | - "application/vnd.jupyter.widget-view+json": { |
24739 | | - "model_id": "787a8a9f95da415ba25e643addfa9db4", |
24740 | | - "version_major": 2, |
24741 | | - "version_minor": 0 |
24742 | | - }, |
24743 | | - "text/plain": [ |
24744 | | - "vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]" |
24745 | | - ] |
24746 | | - }, |
24747 | | - "metadata": {}, |
24748 | | - "output_type": "display_data" |
24749 | | - }, |
24750 | | - { |
24751 | | - "data": { |
24752 | | - "application/vnd.jupyter.widget-view+json": { |
24753 | | - "model_id": "fef2e3140b4c419183ad0e150fa55cf8", |
24754 | | - "version_major": 2, |
24755 | | - "version_minor": 0 |
24756 | | - }, |
24757 | | - "text/plain": [ |
24758 | | - "tokenizer.json: 0%| | 0.00/466k [00:00<?, ?B/s]" |
24759 | | - ] |
24760 | | - }, |
24761 | | - "metadata": {}, |
24762 | | - "output_type": "display_data" |
24763 | | - }, |
24764 | | - { |
24765 | | - "data": { |
24766 | | - "application/vnd.jupyter.widget-view+json": { |
24767 | | - "model_id": "93fd6d668f444b34a706af424dee5d77", |
24768 | | - "version_major": 2, |
24769 | | - "version_minor": 0 |
24770 | | - }, |
24771 | | - "text/plain": [ |
24772 | | - "special_tokens_map.json: 0%| | 0.00/112 [00:00<?, ?B/s]" |
24773 | | - ] |
24774 | | - }, |
24775 | | - "metadata": {}, |
24776 | | - "output_type": "display_data" |
24777 | | - }, |
24778 | | - { |
24779 | | - "data": { |
24780 | | - "application/vnd.jupyter.widget-view+json": { |
24781 | | - "model_id": "73d5d8704d69476ca2ad98bae7c55907", |
24782 | | - "version_major": 2, |
24783 | | - "version_minor": 0 |
24784 | | - }, |
24785 | | - "text/plain": [ |
24786 | | - "1_Pooling/config.json: 0%| | 0.00/190 [00:00<?, ?B/s]" |
24787 | | - ] |
24788 | | - }, |
24789 | | - "metadata": {}, |
24790 | | - "output_type": "display_data" |
24791 | | - }, |
24792 | | - { |
24793 | | - "data": { |
24794 | | - "application/vnd.jupyter.widget-view+json": { |
24795 | | - "model_id": "dcd0c1bdd79941f296f76d961846ebe2", |
24796 | | - "version_major": 2, |
24797 | | - "version_minor": 0 |
24798 | | - }, |
24799 | | - "text/plain": [ |
24800 | | - "Batches: 0%| | 0/589 [00:00<?, ?it/s]" |
24801 | | - ] |
24802 | | - }, |
24803 | | - "metadata": {}, |
24804 | | - "output_type": "display_data" |
24805 | | - }, |
24806 | | - { |
24807 | | - "name": "stderr", |
24808 | | - "output_type": "stream", |
24809 | | - "text": [ |
24810 | | - "2024-07-30 19:55:21,993 - BERTopic - Embedding - Completed ✓\n", |
24811 | | - "2024-07-30 19:55:21,996 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n", |
24812 | | - "2024-07-30 19:55:35,484 - BERTopic - Dimensionality - Completed ✓\n", |
24813 | | - "2024-07-30 19:55:35,485 - BERTopic - Cluster - Start clustering the reduced embeddings\n", |
24814 | | - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", |
24815 | | - "To disable this warning, you can either:\n", |
24816 | | - "\t- Avoid using `tokenizers` before the fork if possible\n", |
24817 | | - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", |
24818 | | - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", |
24819 | | - "To disable this warning, you can either:\n", |
24820 | | - "\t- Avoid using `tokenizers` before the fork if possible\n", |
24821 | | - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", |
24822 | | - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", |
24823 | | - "To disable this warning, you can either:\n", |
24824 | | - "\t- Avoid using `tokenizers` before the fork if possible\n", |
24825 | | - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", |
24826 | | - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", |
24827 | | - "To disable this warning, you can either:\n", |
24828 | | - "\t- Avoid using `tokenizers` before the fork if possible\n", |
24829 | | - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", |
24830 | | - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", |
24831 | | - "To disable this warning, you can either:\n", |
24832 | | - "\t- Avoid using `tokenizers` before the fork if possible\n", |
24833 | | - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", |
24834 | | - "2024-07-30 19:55:37,980 - BERTopic - Cluster - Completed ✓\n", |
24835 | | - "2024-07-30 19:55:37,988 - BERTopic - Representation - Extracting topics from clusters using representation models.\n", |
24836 | | - "2024-07-30 19:55:39,801 - BERTopic - Representation - Completed ✓\n" |
24837 | | - ] |
24838 | | - } |
24839 | | - ], |
| 24630 | + "outputs": [], |
24840 | 24631 | "source": [ |
24841 | 24632 | "from bertopic import BERTopic\n", |
24842 | 24633 | "\n", |
|
80847 | 80638 | "source": [ |
80848 | 80639 | "[Link to BERTopic](https://bit.ly/4fjwU9T)." |
80849 | 80640 | ] |
| 80641 | + }, |
| 80642 | + { |
| 80643 | + "cell_type": "markdown", |
| 80644 | + "id": "064743d9", |
| 80645 | + "metadata": {}, |
| 80646 | + "source": [ |
| 80647 | + "### Beyond Keywords: Building a Semantic Recipe Search Engine" |
| 80648 | + ] |
| 80649 | + }, |
| 80650 | + { |
| 80651 | + "cell_type": "markdown", |
| 80652 | + "id": "82694664", |
| 80653 | + "metadata": {}, |
| 80654 | + "source": [ |
| 80655 | + "Semantic search enables content discovery based on meaning rather than just keywords. This approach uses vector embeddings — numerical representations of text that capture its meaning.\n", |
| 80656 | + "\n", |
| 80657 | + "By converting text to vector embeddings, we can quantify semantic similarity between different pieces of content in a high-dimensional vector space. This allows for comparison and search based on underlying meaning, surpassing simple keyword matching.\n", |
| 80658 | + "\n", |
| 80659 | + "Here's a Python implementation of semantic search for recipe recommendations using sentence-transformers:" |
| 80660 | + ] |
| 80661 | + }, |
| 80662 | + { |
| 80663 | + "cell_type": "code", |
| 80664 | + "execution_count": 9, |
| 80665 | + "id": "84497e32", |
| 80666 | + "metadata": {}, |
| 80667 | + "outputs": [ |
| 80668 | + { |
| 80669 | + "name": "stdout", |
| 80670 | + "output_type": "stream", |
| 80671 | + "text": [ |
| 80672 | + "Query: healthy dessert without sugar\n", |
| 80673 | + "Most similar recipes:\n", |
| 80674 | + "- No-Bake Berry Chia Seed Pudding (Similarity: 0.55)\n", |
| 80675 | + "- Banana and Date Sweetened Oatmeal Cookies (Similarity: 0.43)\n" |
| 80676 | + ] |
| 80677 | + } |
| 80678 | + ], |
| 80679 | + "source": [ |
| 80680 | + "from sentence_transformers import SentenceTransformer\n", |
| 80681 | + "from sklearn.metrics.pairwise import cosine_similarity\n", |
| 80682 | + "\n", |
| 80683 | + "# Step 1: Prepare our data\n", |
| 80684 | + "recipes = [\n", |
| 80685 | + " \"Banana and Date Sweetened Oatmeal Cookies\",\n", |
| 80686 | + " \"No-Bake Berry Chia Seed Pudding\",\n", |
| 80687 | + " \"Deep-Fried Oreo Sundae with Caramel Sauce\",\n", |
| 80688 | + " \"Loaded Bacon Cheeseburger Pizza\",\n", |
| 80689 | + "]\n", |
| 80690 | + "\n", |
| 80691 | + "# Step 2: Load a pre-trained model for creating embeddings\n", |
| 80692 | + "model = SentenceTransformer('all-MiniLM-L6-v2')\n", |
| 80693 | + "\n", |
| 80694 | + "# Step 3: Create embeddings for our recipe descriptions\n", |
| 80695 | + "recipe_embeddings = model.encode(recipes)\n", |
| 80696 | + "\n", |
| 80697 | + "# Step 4: Function to find similar recipes \n", |
| 80698 | + "def find_similar_recipes(query, top_k=2):\n", |
| 80699 | + " # Create embedding for the query\n", |
| 80700 | + " query_embedding = model.encode([query])\n", |
| 80701 | + " \n", |
| 80702 | + " # Calculate similarity\n", |
| 80703 | + " similarities = cosine_similarity(query_embedding, recipe_embeddings)[0]\n", |
| 80704 | + " \n", |
| 80705 | + " # Get top k similar recipes \n", |
| 80706 | + " top_indices = similarities.argsort()[-top_k:][::-1]\n", |
| 80707 | + " \n", |
| 80708 | + " return [(recipes[i], similarities[i]) for i in top_indices]\n", |
| 80709 | + "\n", |
| 80710 | + "# Step 5: Test our semantic search\n", |
| 80711 | + "query = \"healthy dessert without sugar\"\n", |
| 80712 | + "results = find_similar_recipes(query)\n", |
| 80713 | + "\n", |
| 80714 | + "print(f\"Query: {query}\")\n", |
| 80715 | + "print(\"Most similar recipes:\")\n", |
| 80716 | + "for recipe, score in results:\n", |
| 80717 | + " print(f\"- {recipe} (Similarity: {score:.2f})\")" |
| 80718 | + ] |
| 80719 | + }, |
| 80720 | + { |
| 80721 | + "cell_type": "markdown", |
| 80722 | + "id": "fe3a8c67", |
| 80723 | + "metadata": {}, |
| 80724 | + "source": [ |
| 80725 | + "The search ranks the two healthier desserts highest, even though the query shares no keywords with any recipe name: recipes built around berries, chia seeds, bananas, and dates land closer to the query in embedding space than the deep-fried sundae or the cheeseburger pizza, which fall outside the top two results. This suggests the embeddings capture what \"healthy\" means in the dessert context. The score difference (0.55 vs 0.43) indicates that the model considers the chia seed pudding a closer match to the concept of a healthy, sugar-free dessert than the oatmeal cookies." |
| 80726 | + ] |
80850 | 80727 | } |
80851 | 80728 | ], |
80852 | 80729 | "metadata": { |
|
0 commit comments