From 1400d118a65e096325e4a3df0caf19bcdb48abc9 Mon Sep 17 00:00:00 2001 From: DeflateAwning <11021263+DeflateAwning@users.noreply.github.com> Date: Mon, 13 Nov 2023 00:35:22 -0700 Subject: [PATCH 1/9] Add __all__ --- shopify_scraper/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/shopify_scraper/__init__.py b/shopify_scraper/__init__.py index e69de29..178c248 100644 --- a/shopify_scraper/__init__.py +++ b/shopify_scraper/__init__.py @@ -0,0 +1,10 @@ + +__all__ = [ + 'get_json', + 'get_products', + 'get_variants', + 'get_images' +] + +# excluded from __all__: +# 'json_list_to_df', 'to_df' From 4bfcae351c550ba2c3f7e97016589a44a5a9a44f Mon Sep 17 00:00:00 2001 From: DeflateAwning <11021263+DeflateAwning@users.noreply.github.com> Date: Mon, 13 Nov 2023 00:37:43 -0700 Subject: [PATCH 2/9] Add type hinting to functions --- shopify_scraper/scraper.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/shopify_scraper/scraper.py b/shopify_scraper/scraper.py index a2973ed..bd433fe 100644 --- a/shopify_scraper/scraper.py +++ b/shopify_scraper/scraper.py @@ -8,8 +8,7 @@ import pandas as pd import requests - -def get_json(url, page): +def get_json(url: str, page: int) -> str: """ Get Shopify products.json from a store URL. @@ -38,8 +37,7 @@ def get_json(url, page): except requests.exceptions.RequestException as error: print("Error: ", error) - -def to_df(products_json): +def to_df(products_json: str) -> pd.DataFrame: """ Convert products.json to a pandas DataFrame. @@ -56,8 +54,7 @@ def to_df(products_json): except Exception as e: print(e) - -def get_products(url): +def get_products(url: str) -> pd.DataFrame: """ Get all products from a store. @@ -82,9 +79,8 @@ def get_products(url): df['url'] = f"{url}/products/" + df['handle'] return df - -def get_variants(products): - """Get variants from a list of products. +def get_variants(products: pd.DataFrame) -> pd.DataFrame: + """Get variants from a table of products. Args: products (pd.DataFrame): Pandas dataframe of products from get_products() @@ -108,8 +104,7 @@ def get_variants(products): df_variants = df_variants.merge(df_parent_data, left_on='product_id', right_on='parent_id') return df_variants - -def json_list_to_df(df, col): +def json_list_to_df(df: pd.DataFrame, col: str) -> pd.DataFrame: """Return a Pandas dataframe based on a column that contains a list of JSON objects. Args: @@ -121,14 +116,13 @@ def json_list_to_df(df, col): """ rows = [] - for index, row in df[col].iteritems(): + for index, row in df[col].iteritems(): # FIXME: support removed in Pandas 2 for item in row: rows.append(item) df = pd.DataFrame(rows) return df - -def get_images(df_products): +def get_images(df_products: pd.DataFrame) -> pd.DataFrame: """Get images from a list of products. Args: @@ -139,4 +133,3 @@ def get_images(df_products): """ return json_list_to_df(df_products, 'images') - From c91798d76a26b09fab080ef4d430094387dc6943 Mon Sep 17 00:00:00 2001 From: DeflateAwning <11021263+DeflateAwning@users.noreply.github.com> Date: Mon, 13 Nov 2023 00:38:43 -0700 Subject: [PATCH 3/9] Remove useless/dangerous premature error handling --- shopify_scraper/scraper.py | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/shopify_scraper/scraper.py b/shopify_scraper/scraper.py index bd433fe..9e2b252 100644 --- a/shopify_scraper/scraper.py +++ b/shopify_scraper/scraper.py @@ -19,23 +19,10 @@ def get_json(url: str, page: int) -> str: products_json: Products.json from the store. """ - try: - response = requests.get(f'{url}/products.json?limit=250&page={page}', timeout=5) - products_json = response.text - response.raise_for_status() - return products_json - - except requests.exceptions.HTTPError as error_http: - print("HTTP Error:", error_http) - - except requests.exceptions.ConnectionError as error_connection: - print("Connection Error:", error_connection) - - except requests.exceptions.Timeout as error_timeout: - print("Timeout Error:", error_timeout) - - except requests.exceptions.RequestException as error: - print("Error: ", error) + response = requests.get(f'{url}/products.json?limit=250&page={page}', timeout=5) + products_json = response.text + response.raise_for_status() + return products_json def to_df(products_json: str) -> pd.DataFrame: """ @@ -47,12 +34,9 @@ def to_df(products_json: str) -> pd.DataFrame: df: Pandas DataFrame of the products.json. """ - try: - products_dict = json.loads(products_json) - df = pd.DataFrame.from_dict(products_dict['products']) - return df - except Exception as e: - print(e) + products_dict = json.loads(products_json) + df = pd.DataFrame.from_dict(products_dict['products']) + return df def get_products(url: str) -> pd.DataFrame: """ From 29381c8a3b5e975172d51c5119c6c8f512c70708 Mon Sep 17 00:00:00 2001 From: DeflateAwning <11021263+DeflateAwning@users.noreply.github.com> Date: Mon, 13 Nov 2023 00:44:15 -0700 Subject: [PATCH 4/9] Rename parent->product, and children->variants * Rename objects (parent->product, and children->variants) * Add comment about relationships --- .gitignore | 4 ++-- README.md | 20 +++++++++++--------- shopify_scraper/scraper.py | 6 +++--- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index fe1f6e5..107683b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ venv .idea example.py -parents.csv -children.csv +products.csv +variants.csv images.csv \ No newline at end of file diff --git a/README.md b/README.md index 242125b..10ac27c 100644 --- a/README.md +++ b/README.md @@ -18,19 +18,21 @@ from shopify_scraper import scraper url = "https://yourshopifydomain.com" -parents = scraper.get_products(url) -parents.to_csv('parents.csv', index=False) -print('Parents: ', len(parents)) +products = scraper.get_products(url) +products.to_csv('products.csv', index=False) +print('Products: count=', len(products)) -children = scraper.get_variants(parents) -children.to_csv('children.csv', index=False) -print('Children: ', len(children)) +variants = scraper.get_variants(products) +variants.to_csv('variants.csv', index=False) +print('Variants: count=', len(variants)) -images = scraper.get_images(parents) +images = scraper.get_images(products) images.to_csv('images.csv', index=False) -print('Images: ', len(images)) - +print('Images: count=', len(images)) ``` +Note that variants has a many-to-one relationship with products, ON variant.product_id = product.id. + +Note that images has a many-to-one relationship with products, ON image.product_id = product.id. diff --git a/shopify_scraper/scraper.py b/shopify_scraper/scraper.py index 9e2b252..5b7ee0c 100644 --- a/shopify_scraper/scraper.py +++ b/shopify_scraper/scraper.py @@ -83,9 +83,9 @@ def get_variants(products: pd.DataFrame) -> pd.DataFrame: df_variants['id'].astype(int) df_variants['product_id'].astype(int) - df_parent_data = products[['id', 'title', 'vendor']] - df_parent_data = df_parent_data.rename(columns={'title': 'parent_title', 'id': 'parent_id'}) - df_variants = df_variants.merge(df_parent_data, left_on='product_id', right_on='parent_id') + df_product_data = products[['id', 'title', 'vendor']] + df_product_data = df_product_data.rename(columns={'title': 'product_title', 'id': 'product_id'}) + df_variants = df_variants.merge(df_product_data, left_on='product_id', right_on='product_id') return df_variants def json_list_to_df(df: pd.DataFrame, col: str) -> pd.DataFrame: From a93701f4aab2c5f0400df287971604c5e36aa2d2 Mon Sep 17 00:00:00 2001 From: DeflateAwning <11021263+DeflateAwning@users.noreply.github.com> Date: Mon, 13 Nov 2023 00:46:11 -0700 Subject: [PATCH 5/9] Add note about development --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 10ac27c..2e94bb2 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,15 @@ To install ShopifyScraper, run the following command: pip3 install git+https://github.com/practical-data-science/ShopifyScraper.git ``` +#### Development +To install ShopifyScraper in development mode, run the following command: + +```bash +git clone https://github.com/practical-data-science/ShopifyScraper.git +cd ShopifyScraper +pip3 install -e . +``` + ### Usage ```python From 31290a10267b106e7d2aa4bc6e44d396844c3688 Mon Sep 17 00:00:00 2001 From: DeflateAwning <11021263+DeflateAwning@users.noreply.github.com> Date: Mon, 13 Nov 2023 00:50:04 -0700 Subject: [PATCH 6/9] Add distribution artifacts to gitignore --- .gitignore | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 107683b..b6d7fab 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,26 @@ -venv +venv/ .idea example.py products.csv variants.csv -images.csv \ No newline at end of file +images.csv + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST From 5455a2839ab9e6c5a0ba711eaa8c2c16e90a48f5 Mon Sep 17 00:00:00 2001 From: DeflateAwning <11021263+DeflateAwning@users.noreply.github.com> Date: Mon, 13 Nov 2023 00:53:32 -0700 Subject: [PATCH 7/9] Add __pycache__ to gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index b6d7fab..e62f729 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ products.csv variants.csv images.csv +**/__pycache__/ + # Distribution / packaging .Python build/ From c1bf969bf9ff38e38ce232a4f4cda37741bba477 Mon Sep 17 00:00:00 2001 From: DeflateAwning <11021263+DeflateAwning@users.noreply.github.com> Date: Mon, 13 Nov 2023 00:59:59 -0700 Subject: [PATCH 8/9] Add support for Pandas v2 * Rename `json_list_to_df` to `flatten_column_to_dataframe` * Re-implement `flatten_column_to_dataframe` to support Pandas v2 --- shopify_scraper/__init__.py | 2 +- shopify_scraper/scraper.py | 13 ++++--------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/shopify_scraper/__init__.py b/shopify_scraper/__init__.py index 178c248..15b60ae 100644 --- a/shopify_scraper/__init__.py +++ b/shopify_scraper/__init__.py @@ -7,4 +7,4 @@ ] # excluded from __all__: -# 'json_list_to_df', 'to_df' +# 'flatten_column_to_dataframe', 'to_df' diff --git a/shopify_scraper/scraper.py b/shopify_scraper/scraper.py index 5b7ee0c..5201088 100644 --- a/shopify_scraper/scraper.py +++ b/shopify_scraper/scraper.py @@ -77,7 +77,6 @@ def get_variants(products: pd.DataFrame) -> pd.DataFrame: df_variants = pd.DataFrame() for row in products.itertuples(index='True'): - for variant in getattr(row, 'variants'): df_variants = pd.concat([df_variants, pd.DataFrame.from_records(variant, index=[0])]) @@ -88,7 +87,7 @@ def get_variants(products: pd.DataFrame) -> pd.DataFrame: df_variants = df_variants.merge(df_product_data, left_on='product_id', right_on='product_id') return df_variants -def json_list_to_df(df: pd.DataFrame, col: str) -> pd.DataFrame: +def flatten_column_to_dataframe(df: pd.DataFrame, col: str) -> pd.DataFrame: """Return a Pandas dataframe based on a column that contains a list of JSON objects. Args: @@ -99,12 +98,8 @@ def json_list_to_df(df: pd.DataFrame, col: str) -> pd.DataFrame: Pandas dataframe: A new dataframe with the JSON objects expanded into columns. """ - rows = [] - for index, row in df[col].iteritems(): # FIXME: support removed in Pandas 2 - for item in row: - rows.append(item) - df = pd.DataFrame(rows) - return df + rows = [item for row in df[col] for item in row] + return pd.DataFrame(rows) def get_images(df_products: pd.DataFrame) -> pd.DataFrame: """Get images from a list of products. @@ -116,4 +111,4 @@ def get_images(df_products: pd.DataFrame) -> pd.DataFrame: images (pd.DataFrame): Pandas dataframe of images """ - return json_list_to_df(df_products, 'images') + return flatten_column_to_dataframe(df_products, 'images') From 804081d673b7f17f6497e10e10eb24f3724d8b29 Mon Sep 17 00:00:00 2001 From: DeflateAwning <11021263+DeflateAwning@users.noreply.github.com> Date: Mon, 13 Nov 2023 22:35:48 -0700 Subject: [PATCH 9/9] Improve semantics and general structure --- shopify_scraper/__init__.py | 2 +- shopify_scraper/scraper.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/shopify_scraper/__init__.py b/shopify_scraper/__init__.py index 15b60ae..53a8196 100644 --- a/shopify_scraper/__init__.py +++ b/shopify_scraper/__init__.py @@ -7,4 +7,4 @@ ] # excluded from __all__: -# 'flatten_column_to_dataframe', 'to_df' +# '_flatten_column_to_dataframe', '_products_json_to_df' diff --git a/shopify_scraper/scraper.py b/shopify_scraper/scraper.py index 5201088..e27a68a 100644 --- a/shopify_scraper/scraper.py +++ b/shopify_scraper/scraper.py @@ -24,7 +24,7 @@ def get_json(url: str, page: int) -> str: response.raise_for_status() return products_json -def to_df(products_json: str) -> pd.DataFrame: +def _products_json_to_df(products_json: str) -> pd.DataFrame: """ Convert products.json to a pandas DataFrame. @@ -52,7 +52,7 @@ def get_products(url: str) -> pd.DataFrame: while results: products_json = get_json(url, page) - products_dict = to_df(products_json) + products_dict = _products_json_to_df(products_json) if len(products_dict) == 0: break @@ -83,11 +83,11 @@ def get_variants(products: pd.DataFrame) -> pd.DataFrame: df_variants['id'].astype(int) df_variants['product_id'].astype(int) df_product_data = products[['id', 'title', 'vendor']] - df_product_data = df_product_data.rename(columns={'title': 'product_title', 'id': 'product_id'}) - df_variants = df_variants.merge(df_product_data, left_on='product_id', right_on='product_id') + df_product_data = df_product_data.rename(columns={'title': 'product_title', 'id': 'product_id', 'vendor': 'product_vendor'}) + df_variants = df_variants.merge(df_product_data, left_on='product_id', right_on='product_id', validate='m:1') return df_variants -def flatten_column_to_dataframe(df: pd.DataFrame, col: str) -> pd.DataFrame: +def _flatten_column_to_dataframe(df: pd.DataFrame, col: str) -> pd.DataFrame: """Return a Pandas dataframe based on a column that contains a list of JSON objects. Args: @@ -111,4 +111,4 @@ def get_images(df_products: pd.DataFrame) -> pd.DataFrame: images (pd.DataFrame): Pandas dataframe of images """ - return flatten_column_to_dataframe(df_products, 'images') + return _flatten_column_to_dataframe(df_products, 'images')