diff --git a/.gitignore b/.gitignore index fe1f6e5..e62f729 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,28 @@ -venv +venv/ .idea example.py -parents.csv -children.csv -images.csv \ No newline at end of file +products.csv +variants.csv +images.csv + +**/__pycache__/ + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST diff --git a/README.md b/README.md index 242125b..2e94bb2 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,15 @@ To install ShopifyScraper, run the following command: pip3 install git+https://github.com/practical-data-science/ShopifyScraper.git ``` +#### Development +To install ShopifyScraper in development mode, run the following command: + +```bash +git clone https://github.com/practical-data-science/ShopifyScraper.git +cd ShopifyScraper +pip3 install -e . +``` + ### Usage ```python @@ -18,19 +27,21 @@ from shopify_scraper import scraper url = "https://yourshopifydomain.com" -parents = scraper.get_products(url) -parents.to_csv('parents.csv', index=False) -print('Parents: ', len(parents)) +products = scraper.get_products(url) +products.to_csv('products.csv', index=False) +print('Products: count=', len(products)) -children = scraper.get_variants(parents) -children.to_csv('children.csv', index=False) -print('Children: ', len(children)) +variants = scraper.get_variants(products) +variants.to_csv('variants.csv', index=False) +print('Variants: count=', len(variants)) -images = scraper.get_images(parents) +images = scraper.get_images(products) images.to_csv('images.csv', index=False) -print('Images: ', len(images)) - +print('Images: count=', len(images)) ``` +Each row of `variants` belongs to exactly one row of `products` (a many-to-one relationship); join them on `variants.product_id = products.id`. + +Each row of `images` likewise belongs to exactly one row of `products` (a many-to-one relationship); join them on `images.product_id = products.id`.
diff --git a/shopify_scraper/__init__.py b/shopify_scraper/__init__.py index e69de29..53a8196 100644 --- a/shopify_scraper/__init__.py +++ b/shopify_scraper/__init__.py @@ -0,0 +1,12 @@ +# Re-export the public API so that every name listed in __all__ is actually bound on the package. +from .scraper import get_images, get_json, get_products, get_variants + +__all__ = [ + 'get_json', + 'get_products', + 'get_variants', + 'get_images' +] + +# excluded from __all__: +# '_flatten_column_to_dataframe', '_products_json_to_df' diff --git a/shopify_scraper/scraper.py b/shopify_scraper/scraper.py index a2973ed..e27a68a 100644 --- a/shopify_scraper/scraper.py +++ b/shopify_scraper/scraper.py @@ -8,8 +8,7 @@ import pandas as pd import requests - -def get_json(url, page): +def get_json(url: str, page: int) -> str: """ Get Shopify products.json from a store URL. @@ -20,26 +19,12 @@ def get_json(url, page): products_json: Products.json from the store. """ - try: - response = requests.get(f'{url}/products.json?limit=250&page={page}', timeout=5) - products_json = response.text - response.raise_for_status() - return products_json - - except requests.exceptions.HTTPError as error_http: - print("HTTP Error:", error_http) - - except requests.exceptions.ConnectionError as error_connection: - print("Connection Error:", error_connection) - - except requests.exceptions.Timeout as error_timeout: - print("Timeout Error:", error_timeout) + response = requests.get(f'{url}/products.json?limit=250&page={page}', timeout=5) + products_json = response.text + response.raise_for_status() + return products_json - except requests.exceptions.RequestException as error: - print("Error: ", error) - - -def to_df(products_json): +def _products_json_to_df(products_json: str) -> pd.DataFrame: """ Convert products.json to a pandas DataFrame. @@ -49,15 +34,11 @@ def to_df(products_json): df: Pandas DataFrame of the products.json.
""" - try: - products_dict = json.loads(products_json) - df = pd.DataFrame.from_dict(products_dict['products']) - return df - except Exception as e: - print(e) - + products_dict = json.loads(products_json) + df = pd.DataFrame.from_dict(products_dict['products']) + return df -def get_products(url): +def get_products(url: str) -> pd.DataFrame: """ Get all products from a store. @@ -71,7 +52,7 @@ def get_products(url): while results: products_json = get_json(url, page) - products_dict = to_df(products_json) + products_dict = _products_json_to_df(products_json) if len(products_dict) == 0: break @@ -82,9 +63,8 @@ def get_products(url): df['url'] = f"{url}/products/" + df['handle'] return df - -def get_variants(products): - """Get variants from a list of products. +def get_variants(products: pd.DataFrame) -> pd.DataFrame: + """Get variants from a table of products. Args: products (pd.DataFrame): Pandas dataframe of products from get_products() @@ -97,19 +77,17 @@ def get_variants(products): df_variants = pd.DataFrame() for row in products.itertuples(index='True'): - for variant in getattr(row, 'variants'): df_variants = pd.concat([df_variants, pd.DataFrame.from_records(variant, index=[0])]) df_variants['id'].astype(int) df_variants['product_id'].astype(int) - df_parent_data = products[['id', 'title', 'vendor']] - df_parent_data = df_parent_data.rename(columns={'title': 'parent_title', 'id': 'parent_id'}) - df_variants = df_variants.merge(df_parent_data, left_on='product_id', right_on='parent_id') + df_product_data = products[['id', 'title', 'vendor']] + df_product_data = df_product_data.rename(columns={'title': 'product_title', 'id': 'product_id', 'vendor': 'product_vendor'}) + df_variants = df_variants.merge(df_product_data, left_on='product_id', right_on='product_id', validate='m:1') return df_variants - -def json_list_to_df(df, col): +def _flatten_column_to_dataframe(df: pd.DataFrame, col: str) -> pd.DataFrame: """Return a Pandas dataframe based on a column that 
contains a list of JSON objects. Args: @@ -120,15 +98,10 @@ def json_list_to_df(df, col): Pandas dataframe: A new dataframe with the JSON objects expanded into columns. """ - rows = [] - for index, row in df[col].iteritems(): - for item in row: - rows.append(item) - df = pd.DataFrame(rows) - return df - + rows = [item for row in df[col] for item in row] + return pd.DataFrame(rows) -def get_images(df_products): +def get_images(df_products: pd.DataFrame) -> pd.DataFrame: """Get images from a list of products. Args: @@ -138,5 +111,4 @@ def get_images(df_products): images (pd.DataFrame): Pandas dataframe of images """ - return json_list_to_df(df_products, 'images') - + return _flatten_column_to_dataframe(df_products, 'images')