"""Fetch monthly search results from the Japan Customs statistics search page.

Running this file as a script walks a range of months, scrapes the result
table for each one with Selenium/Chrome and writes one CSV file per month.
"""
import time
from pathlib import Path
from typing import Iterator, Tuple

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.wait import WebDriverWait

# Directory the per-month CSV files are written to.
OUTPUT_DIR = Path(r'C:\Users\Priyesh\Downloads\result')

# Locator of the frame that hosts the search form / search results.
CONTENT_FRAME_XPATH = "/html/frameset/frame"

# Locator of the table that get_df() parses.
RESULT_TABLE_XPATH = '/html/body/div[1]/div[3]/div/div/form/div/table[2]'

# Header cells (<th>) the result table is expected to contain, in document
# order.
EXPECTED_HEADERS = ['COUNTRY', 'UNIT1', 'UNIT2', 'CURRENT MONTH',
                    'CUMULATIVE YEAR TO DATE', 'QUANTITY1', 'QUANTITY2',
                    'VALUE', 'QUANTITY1', 'QUANTITY2', 'VALUE']

# Column names of the returned DataFrame: the two group headers are dropped
# and the second QUANTITY1/QUANTITY2/VALUE triple is prefixed to keep the
# names unique.
OUTPUT_COLUMNS = ['COUNTRY', 'UNIT1', 'UNIT2', 'QUANTITY1', 'QUANTITY2',
                  'VALUE', 'CUMULATIVEQUANTITY1', 'CUMULATIVEQUANTITY2',
                  'CUMULATIVEVALUE']


def download_data(year: str, month: str) -> None:
    """Fill in the search form for one month and click the download button.

    Parameters
    ----------
    year : str
        Year as a string; used as the ``value`` selected in the year dropdown.
    month : str
        Numerical month as a string, e.g. "6" or "11"; used as the ``value``
        selected in the month dropdown.

    Returns
    -------
    None.
    """
    form_root = "/html/body/form/div[1]/div[3]/div/div"
    driver = webdriver.Chrome()
    try:
        driver.get("https://www.customs.go.jp/toukei/srch/indexe.htm?M=01&P=0")
        WebDriverWait(driver, 10).until(
            EC.frame_to_be_available_and_switch_to_it(
                (By.XPATH, CONTENT_FRAME_XPATH)
            )
        )
        # NOTE(review): the driver is already inside the outer frame here, so
        # this lookup is relative to that frame's document. get_df() reaches
        # its table after a single switch -- confirm the form really sits in
        # a nested frame.
        frame = driver.find_element(By.XPATH, "/html/frameset/frame[1]")
        driver.switch_to.frame(frame)

        driver.find_element(By.XPATH, f"{form_root}/p[2]/input[2]").click()

        dropdown_year = Select(
            driver.find_element(By.XPATH, f"{form_root}/p[3]/select[2]")
        )
        dropdown_year.select_by_value(year)

        dropdown_month = Select(
            driver.find_element(By.XPATH, f"{form_root}/p[3]/select[3]")
        )
        dropdown_month.select_by_value(month)

        dropdown_designation = Select(
            driver.find_element(By.XPATH, f"{form_root}/p[4]/select")
        )
        dropdown_designation.select_by_value("2")

        # Replace whatever is in the code field with the space-separated
        # list of codes to search for.
        text_form = driver.find_element(By.XPATH, f"{form_root}/p[5]/input")
        text_form.clear()
        custom_codes = "270900100 270900900 271019162 271019164 271019166 271019169 271019172 271019174 271019179"
        text_form.send_keys(custom_codes)

        submit_path = "//input[contains(@value,'Search')]"
        driver.find_element(By.XPATH, submit_path).click()

        frame = driver.find_element(By.XPATH, "/html/frameset/frame[1]")
        driver.switch_to.frame(frame)

        download_path = "//input[@name='dl']"
        driver.find_element(By.XPATH, download_path).click()
    finally:
        # Always end the browser session, even when a lookup above fails.
        driver.quit()


def get_df(year: str, month: str) -> pd.DataFrame:
    """Scrape the search-result table for one month into a DataFrame.

    Parameters
    ----------
    year : str
        Year as a string; interpolated into the search URL.
    month : str
        Numerical month as a string, e.g. "6" or "11"; interpolated into the
        search URL.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` of the result table after the first, with the
        columns listed in ``OUTPUT_COLUMNS``. A ``<tr>`` without ``<td>``
        cells contributes an empty row.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the content frame (20 s) or the result table (45 s) never appears.
    ValueError
        If the header cells of the table differ from ``EXPECTED_HEADERS``.
    """
    url = f"https://www.customs.go.jp/toukei/srch/indexe.htm?M=01&P=1,2,,,,,,,,1,0,{year},0,{month},0,2,270900100,270900900,271019162,271019164,271019166,271019169,271019172,271019174,271019179,,1,,,,,,,,,,,,,,,,,,,,,,50"
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(
            EC.frame_to_be_available_and_switch_to_it(
                (By.XPATH, CONTENT_FRAME_XPATH)
            )
        )

        # presence_of_element_located takes ONE (by, value) locator tuple.
        WebDriverWait(driver, 45).until(
            EC.presence_of_element_located((By.XPATH, RESULT_TABLE_XPATH))
        )
        # Short pause so the table can finish rendering, then look it up
        # again so the handle used below is fresh.
        time.sleep(2)
        table = driver.find_element(By.XPATH, RESULT_TABLE_XPATH)

        headers = [
            header.text.strip()
            for header in table.find_elements(By.TAG_NAME, "th")
        ]

        # Extract the table rows, skipping the first <tr>.
        rows = [
            [cell.text.strip() for cell in row.find_elements(By.TAG_NAME, "td")]
            for row in table.find_elements(By.TAG_NAME, "tr")[1:]
        ]
    finally:
        # quit() ends the whole browser session; runs on failure as well.
        driver.quit()

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if headers != EXPECTED_HEADERS:
        raise ValueError(
            f"unexpected table headers for {year}-{month}: {headers!r}"
        )

    return pd.DataFrame(rows, columns=OUTPUT_COLUMNS)


def month_year_iter(
    start_month: int, start_year: int, end_month: int, end_year: int
) -> Iterator[Tuple[int, int]]:
    """Yield ``(year, month)`` pairs from the start month up to the end month.

    The start month is included; the end month is NOT (the range is
    half-open), so ``month_year_iter(12, 2023, 4, 2024)`` yields
    ``(2023, 12)``, ``(2024, 1)``, ``(2024, 2)``, ``(2024, 3)``.
    Months are numbered 1-12.
    """
    # Work on a single "months since year 0" counter so year roll-over is
    # handled by divmod.
    ym_start = 12 * start_year + start_month - 1
    ym_end = 12 * end_year + end_month - 1
    for ym in range(ym_start, ym_end):
        y, m = divmod(ym, 12)
        yield y, m + 1


def main() -> None:
    """Scrape every month in the configured range and write one CSV each.

    Files are named ``<year><two-digit month>.csv`` inside ``OUTPUT_DIR``.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    for year, month in month_year_iter(12, 2023, 4, 2024):
        df = get_df(str(year), str(month))
        df.to_csv(OUTPUT_DIR / f"{year}{month:02d}.csv")


if __name__ == "__main__":
    main()