diff --git a/scripts/data_collector/br_index/README.md b/scripts/data_collector/br_index/README.md
new file mode 100644
index 0000000000..ca31e3f7a5
--- /dev/null
+++ b/scripts/data_collector/br_index/README.md
@@ -0,0 +1,61 @@
+# iBOVESPA History Companies Collection
+
+## Requirements
+
+- Install the libs from the file `requirements.txt`
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+- `requirements.txt` file was generated using python3.8
+
+## For the ibovespa (IBOV) index, we have:
+
+
+
+### Method `get_new_companies`
+
+#### Index start date
+
+- The ibovespa index started on 2 January 1968 ([wiki](https://en.wikipedia.org/wiki/%C3%8Dndice_Bovespa)). In order to use this start date in our `bench_start_date(self)` method, two conditions must be satisfied:
+    1) APIs used to download Brazilian stocks (B3) historical prices must keep track of such historic data since 2 January 1968
+
+    2) Some website or API must provide, from that date, the historic index composition. In other words, the companies used to build the index.
+
+ As a consequence, the method `bench_start_date(self)` inside `collector.py` was implemented using `pd.Timestamp("2003-01-03")` due to two reasons
+
+    1) The earliest ibov composition that has been found was from the first quarter of 2003. More information about this composition can be found in the sections below.
+
+    2) Yahoo Finance, one of the libraries used to download symbols' historical prices, keeps track from this date forward.
+
+- Within the `get_new_companies` method, a logic was implemented to get, for each ibovespa component stock, the start date that yahoo finance keeps track of.
+
+#### Code Logic
+
+The code does web scraping of B3's [website](https://sistemaswebb3-listados.b3.com.br/indexPage/day/IBOV?language=pt-br), which keeps track of the ibovespa stocks composition on the current day.
+
+Other approaches, such as `requests` and `Beautiful Soup`, could have been used. However, the website shows the table with the stocks with some delay, since it uses a script inside of it to obtain such compositions.
+Therefore, `selenium` was used to download the stocks' composition in order to overcome this problem.
+
+Furthermore, the data downloaded from the selenium script was preprocessed so it could be saved into the `csv` format established by `scripts/data_collector/index.py`.
+
+
+
+### Method `get_changes`
+
+No suitable data source that keeps track of ibovespa's history stocks composition has been found. The only exception is this [repository](https://github.com/igor17400/IBOV-HCI), which provides such information, but only from the 1st quarter of 2003 to the 3rd quarter of 2021.
+
+With that reference, the index's composition can be compared quarter by quarter and year by year and then generate a file that keeps track of which stocks have been removed and which have been added each quarter and year.
+
+
+
+### Collector Data
+
+```bash
+# parse instruments, using in qlib/instruments.
+python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method parse_instruments
+
+# parse new companies
+python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method save_new_companies
+```
+
diff --git a/scripts/data_collector/br_index/collector.py b/scripts/data_collector/br_index/collector.py
new file mode 100644
index 0000000000..bbb012b5c9
--- /dev/null
+++ b/scripts/data_collector/br_index/collector.py
@@ -0,0 +1,277 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+from functools import partial
+import sys
+from pathlib import Path
+import importlib
+import datetime
+
+import fire
+import pandas as pd
+from tqdm import tqdm
+from loguru import logger
+
+CUR_DIR = Path(__file__).resolve().parent
+sys.path.append(str(CUR_DIR.parent.parent))
+
+from data_collector.index import IndexBase
+from data_collector.utils import get_instruments
+
+quarter_dict = {"1Q": "01-03", "2Q": "05-01", "3Q": "09-01"}
+
+
+class IBOVIndex(IndexBase):
+
+ ibov_index_composition = "https://raw.githubusercontent.com/igor17400/IBOV-HCI/main/historic_composition/{}.csv"
+ years_4_month_periods = []
+
+ def __init__(
+ self,
+ index_name: str,
+ qlib_dir: [str, Path] = None,
+ freq: str = "day",
+ request_retry: int = 5,
+ retry_sleep: int = 3,
+ ):
+ super(IBOVIndex, self).__init__(
+ index_name=index_name, qlib_dir=qlib_dir, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
+ )
+
+ self.today: datetime = datetime.date.today()
+ self.current_4_month_period = self.get_current_4_month_period(self.today.month)
+ self.year = str(self.today.year)
+ self.years_4_month_periods = self.get_four_month_period()
+
+ @property
+ def bench_start_date(self) -> pd.Timestamp:
+ """
+ The ibovespa index started on 2 January 1968 (wiki), however,
+ no suitable data source that keeps track of ibovespa's history
+ stocks composition has been found. Except from the repo indicated
+ in README. Which keeps track of such information starting from
+ the first quarter of 2003
+ """
+ return pd.Timestamp("2003-01-03")
+
+ def get_current_4_month_period(self, current_month: int):
+ """
+        This function is used to calculate the current
+        four month period for the current month. For example,
+        if the current month is August (month 8), its four month
+        period is 2Q.
+
+ OBS: In english Q is used to represent *quarter*
+ which means a three month period. However, in
+ portuguese we use Q to represent a four month period.
+ In other words,
+
+ Jan, Feb, Mar, Apr: 1Q
+ May, Jun, Jul, Aug: 2Q
+        Sep, Oct, Nov, Dec: 3Q
+
+ Parameters
+ ----------
+        current_month : int
+            Current month (1 <= current_month <= 12)
+
+ Returns
+ -------
+ current_4m_period:str
+ Current Four Month Period (1Q or 2Q or 3Q)
+ """
+ if current_month < 5:
+ return "1Q"
+ if current_month < 9:
+ return "2Q"
+ if current_month <= 12:
+ return "3Q"
+ else:
+ return -1
+
+ def get_four_month_period(self):
+ """
+ The ibovespa index is updated every four months.
+ Therefore, we will represent each time period as 2003_1Q
+        which means 2003 first four month period (Jan, Feb, Mar, Apr)
+ """
+ four_months_period = ["1Q", "2Q", "3Q"]
+ init_year = 2003
+ now = datetime.datetime.now()
+ current_year = now.year
+ current_month = now.month
+ for year in [item for item in range(init_year, current_year)]:
+ for el in four_months_period:
+ self.years_4_month_periods.append(str(year)+"_"+el)
+ # For current year the logic must be a little different
+ current_4_month_period = self.get_current_4_month_period(current_month)
+ for i in range(int(current_4_month_period[0])):
+ self.years_4_month_periods.append(str(current_year) + "_" + str(i+1) + "Q")
+ return self.years_4_month_periods
+
+
+ def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame:
+ """formatting the datetime in an instrument
+
+ Parameters
+ ----------
+ inst_df: pd.DataFrame
+ inst_df.columns = [self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD, self.END_DATE_FIELD]
+
+ Returns
+ -------
+ inst_df: pd.DataFrame
+
+ """
+ logger.info("Formatting Datetime")
+ if self.freq != "day":
+ inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply(
+ lambda x: (pd.Timestamp(x) + pd.Timedelta(hours=23, minutes=59)).strftime("%Y-%m-%d %H:%M:%S")
+ )
+ else:
+ inst_df[self.START_DATE_FIELD] = inst_df[self.START_DATE_FIELD].apply(
+ lambda x: (pd.Timestamp(x)).strftime("%Y-%m-%d")
+ )
+
+ inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply(
+ lambda x: (pd.Timestamp(x)).strftime("%Y-%m-%d")
+ )
+ return inst_df
+
+ def format_quarter(self, cell: str):
+ """
+ Parameters
+ ----------
+ cell: str
+            It must be in the format 2003_1Q --> years_4_month_periods
+
+ Returns
+ ----------
+ date: str
+            Returns date in format 2003-01-03 (e.g. for input "2003_1Q", since 1Q maps to "01-03")
+ """
+ cell_split = cell.split("_")
+ return cell_split[0] + "-" + quarter_dict[cell_split[1]]
+
+ def get_changes(self):
+ """
+ Access the index historic composition and compare it quarter
+ by quarter and year by year in order to generate a file that
+ keeps track of which stocks have been removed and which have
+ been added.
+
+        The Dataframe used as reference will provide the index
+        composition for each year and quarter:
+ pd.DataFrame:
+ symbol
+ SH600000
+ SH600001
+ .
+ .
+ .
+
+ Parameters
+ ----------
+ self: is used to represent the instance of the class.
+
+ Returns
+ ----------
+ pd.DataFrame:
+ symbol date type
+ SH600000 2019-11-11 add
+ SH600001 2020-11-10 remove
+ dtypes:
+ symbol: str
+ date: pd.Timestamp
+ type: str, value from ["add", "remove"]
+ """
+ logger.info("Getting companies changes in {} index ...".format(self.index_name))
+
+ try:
+ df_changes_list = []
+ for i in tqdm(range(len(self.years_4_month_periods) - 1)):
+ df = pd.read_csv(self.ibov_index_composition.format(self.years_4_month_periods[i]), on_bad_lines="skip")["symbol"]
+ df_ = pd.read_csv(self.ibov_index_composition.format(self.years_4_month_periods[i + 1]), on_bad_lines="skip")["symbol"]
+
+ ## Remove Dataframe
+ remove_date = self.years_4_month_periods[i].split("_")[0] + "-" + quarter_dict[self.years_4_month_periods[i].split("_")[1]]
+ list_remove = list(df[~df.isin(df_)])
+ df_removed = pd.DataFrame(
+ {
+ "date": len(list_remove) * [remove_date],
+ "type": len(list_remove) * ["remove"],
+ "symbol": list_remove,
+ }
+ )
+
+ ## Add Dataframe
+ add_date = self.years_4_month_periods[i + 1].split("_")[0] + "-" + quarter_dict[self.years_4_month_periods[i + 1].split("_")[1]]
+ list_add = list(df_[~df_.isin(df)])
+ df_added = pd.DataFrame(
+ {"date": len(list_add) * [add_date], "type": len(list_add) * ["add"], "symbol": list_add}
+ )
+
+ df_changes_list.append(pd.concat([df_added, df_removed], sort=False))
+ df = pd.concat(df_changes_list).reset_index(drop=True)
+ df["symbol"] = df["symbol"].astype(str) + ".SA"
+
+ return df
+
+ except Exception as E:
+ logger.error("An error occured while downloading 2008 index composition - {}".format(E))
+
+ def get_new_companies(self):
+ """
+ Get latest index composition.
+ The repo indicated on README has implemented a script
+ to get the latest index composition from B3 website using
+ selenium. Therefore, this method will download the file
+ containing such composition
+
+ Parameters
+ ----------
+ self: is used to represent the instance of the class.
+
+ Returns
+ ----------
+ pd.DataFrame:
+ symbol start_date end_date
+ RRRP3 2020-11-13 2022-03-02
+ ALPA4 2008-01-02 2022-03-02
+ dtypes:
+ symbol: str
+ start_date: pd.Timestamp
+ end_date: pd.Timestamp
+ """
+ logger.info("Getting new companies in {} index ...".format(self.index_name))
+
+ try:
+ ## Get index composition
+
+ df_index = pd.read_csv(
+ self.ibov_index_composition.format(self.year + "_" + self.current_4_month_period), on_bad_lines="skip"
+ )
+ df_date_first_added = pd.read_csv(
+ self.ibov_index_composition.format("date_first_added_" + self.year + "_" + self.current_4_month_period),
+ on_bad_lines="skip",
+ )
+ df = df_index.merge(df_date_first_added, on="symbol")[["symbol", "Date First Added"]]
+ df[self.START_DATE_FIELD] = df["Date First Added"].map(self.format_quarter)
+
+ # end_date will be our current quarter + 1, since the IBOV index updates itself every quarter
+ df[self.END_DATE_FIELD] = self.year + "-" + quarter_dict[self.current_4_month_period]
+ df = df[["symbol", self.START_DATE_FIELD, self.END_DATE_FIELD]]
+ df["symbol"] = df["symbol"].astype(str) + ".SA"
+
+ return df
+
+ except Exception as E:
+ logger.error("An error occured while getting new companies - {}".format(E))
+
+ def filter_df(self, df: pd.DataFrame) -> pd.DataFrame:
+ if "Código" in df.columns:
+ return df.loc[:, ["Código"]].copy()
+
+
+
+if __name__ == "__main__":
+ fire.Fire(partial(get_instruments, market_index="br_index" ))
diff --git a/scripts/data_collector/br_index/requirements.txt b/scripts/data_collector/br_index/requirements.txt
new file mode 100644
index 0000000000..c77e932879
--- /dev/null
+++ b/scripts/data_collector/br_index/requirements.txt
@@ -0,0 +1,34 @@
+async-generator==1.10
+attrs==21.4.0
+certifi==2021.10.8
+cffi==1.15.0
+charset-normalizer==2.0.12
+cryptography==36.0.1
+fire==0.4.0
+h11==0.13.0
+idna==3.3
+loguru==0.6.0
+lxml==4.8.0
+multitasking==0.0.10
+numpy==1.22.2
+outcome==1.1.0
+pandas==1.4.1
+pycoingecko==2.2.0
+pycparser==2.21
+pyOpenSSL==22.0.0
+PySocks==1.7.1
+python-dateutil==2.8.2
+pytz==2021.3
+requests==2.27.1
+requests-futures==1.0.0
+six==1.16.0
+sniffio==1.2.0
+sortedcontainers==2.4.0
+termcolor==1.1.0
+tqdm==4.63.0
+trio==0.20.0
+trio-websocket==0.9.2
+urllib3==1.26.8
+wget==3.2
+wsproto==1.1.0
+yahooquery==2.2.15
diff --git a/scripts/data_collector/cn_index/collector.py b/scripts/data_collector/cn_index/collector.py
index e5970c256d..0fdfc658b4 100644
--- a/scripts/data_collector/cn_index/collector.py
+++ b/scripts/data_collector/cn_index/collector.py
@@ -21,6 +21,7 @@
from data_collector.index import IndexBase
from data_collector.utils import get_calendar_list, get_trading_date_by_shift, deco_retry
+from data_collector.utils import get_instruments
NEW_COMPANIES_URL = "https://csi-web-dev.oss-cn-shanghai-finance-1-pub.aliyuncs.com/static/html/csindex/public/uploads/file/autofile/cons/{index_code}cons.xls"
@@ -315,7 +316,7 @@ def get_new_companies(self) -> pd.DataFrame:
return df
-class CSI300(CSIIndex):
+class CSI300Index(CSIIndex):
@property
def index_code(self):
return "000300"
@@ -458,46 +459,5 @@ def get_new_companies(self) -> pd.DataFrame:
return df
-def get_instruments(
- qlib_dir: str,
- index_name: str,
- method: str = "parse_instruments",
- freq: str = "day",
- request_retry: int = 5,
- retry_sleep: int = 3,
-):
- """
-
- Parameters
- ----------
- qlib_dir: str
- qlib data dir, default "Path(__file__).parent/qlib_data"
- index_name: str
- index name, value from ["csi100", "csi300"]
- method: str
- method, value from ["parse_instruments", "save_new_companies"]
- freq: str
- freq, value from ["day", "1min"]
- request_retry: int
- request retry, by default 5
- retry_sleep: int
- request sleep, by default 3
-
- Examples
- -------
- # parse instruments
- $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method parse_instruments
-
- # parse new companies
- $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies
-
- """
- _cur_module = importlib.import_module("data_collector.cn_index.collector")
- obj = getattr(_cur_module, f"{index_name.upper()}")(
- qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
- )
- getattr(obj, method)()
-
-
if __name__ == "__main__":
fire.Fire(get_instruments)
diff --git a/scripts/data_collector/index.py b/scripts/data_collector/index.py
index 497c199482..a23614b413 100644
--- a/scripts/data_collector/index.py
+++ b/scripts/data_collector/index.py
@@ -19,7 +19,7 @@ class IndexBase:
SYMBOL_FIELD_NAME = "symbol"
DATE_FIELD_NAME = "date"
START_DATE_FIELD = "start_date"
- END_DATE_FIELD = "end_ate"
+ END_DATE_FIELD = "end_date"
CHANGE_TYPE_FIELD = "type"
INSTRUMENTS_COLUMNS = [SYMBOL_FIELD_NAME, START_DATE_FIELD, END_DATE_FIELD]
REMOVE = "remove"
diff --git a/scripts/data_collector/us_index/collector.py b/scripts/data_collector/us_index/collector.py
index 576b3c32ae..06c48f8f62 100644
--- a/scripts/data_collector/us_index/collector.py
+++ b/scripts/data_collector/us_index/collector.py
@@ -2,6 +2,7 @@
# Licensed under the MIT License.
import abc
+from functools import partial
import sys
import importlib
from pathlib import Path
@@ -20,6 +21,7 @@
from data_collector.index import IndexBase
from data_collector.utils import deco_retry, get_calendar_list, get_trading_date_by_shift
+from data_collector.utils import get_instruments
WIKI_URL = "https://en.wikipedia.org/wiki"
@@ -269,46 +271,6 @@ def parse_instruments(self):
logger.warning(f"No suitable data source has been found!")
-def get_instruments(
- qlib_dir: str,
- index_name: str,
- method: str = "parse_instruments",
- freq: str = "day",
- request_retry: int = 5,
- retry_sleep: int = 3,
-):
- """
-
- Parameters
- ----------
- qlib_dir: str
- qlib data dir, default "Path(__file__).parent/qlib_data"
- index_name: str
- index name, value from ["SP500", "NASDAQ100", "DJIA", "SP400"]
- method: str
- method, value from ["parse_instruments", "save_new_companies"]
- freq: str
- freq, value from ["day", "1min"]
- request_retry: int
- request retry, by default 5
- retry_sleep: int
- request sleep, by default 3
-
- Examples
- -------
- # parse instruments
- $ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method parse_instruments
-
- # parse new companies
- $ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method save_new_companies
-
- """
- _cur_module = importlib.import_module("data_collector.us_index.collector")
- obj = getattr(_cur_module, f"{index_name.upper()}Index")(
- qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
- )
- getattr(obj, method)()
-
if __name__ == "__main__":
- fire.Fire(get_instruments)
+ fire.Fire(partial(get_instruments, market_index="us_index"))
diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py
index 1814b75eae..7ef1cdf959 100644
--- a/scripts/data_collector/utils.py
+++ b/scripts/data_collector/utils.py
@@ -2,6 +2,7 @@
# Licensed under the MIT License.
import re
+import importlib
import time
import bisect
import pickle
@@ -19,6 +20,7 @@
from tqdm import tqdm
from functools import partial
from concurrent.futures import ProcessPoolExecutor
+from bs4 import BeautifulSoup
HS_SYMBOLS_URL = "http://app.finance.ifeng.com/hq/list.php?type=stock_a&class={s_type}"
@@ -34,6 +36,7 @@
# NOTE: Use the time series of ^GSPC(SP500) as the sequence of all stocks
"US_ALL": "^GSPC",
"IN_ALL": "^NSEI",
+ "BR_ALL": "^BVSP",
}
_BENCH_CALENDAR_LIST = None
@@ -41,6 +44,7 @@
_HS_SYMBOLS = None
_US_SYMBOLS = None
_IN_SYMBOLS = None
+_BR_SYMBOLS = None
_EN_FUND_SYMBOLS = None
_CALENDAR_MAP = {}
@@ -69,7 +73,9 @@ def _get_calendar(url):
calendar = _CALENDAR_MAP.get(bench_code, None)
if calendar is None:
- if bench_code.startswith("US_") or bench_code.startswith("IN_"):
+ if bench_code.startswith("US_") or bench_code.startswith("IN_") or bench_code.startswith("BR_"):
+ print(Ticker(CALENDAR_BENCH_URL_MAP[bench_code]))
+ print(Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max"))
df = Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max")
calendar = df.index.get_level_values(level="date").map(pd.Timestamp).unique().tolist()
else:
@@ -345,6 +351,57 @@ def _format(s_):
return _IN_SYMBOLS
+def get_br_stock_symbols(qlib_data_path: [str, Path] = None) -> list:
+ """get Brazil(B3) stock symbols
+
+ Returns
+ -------
+ B3 stock symbols
+ """
+ global _BR_SYMBOLS
+
+ @deco_retry
+ def _get_ibovespa():
+ _symbols = []
+ url = "https://www.fundamentus.com.br/detalhes.php?papel="
+
+ # Request
+ agent = {"User-Agent": "Mozilla/5.0"}
+ page = requests.get(url, headers=agent)
+
+ # BeautifulSoup
+ soup = BeautifulSoup(page.content, "html.parser")
+ tbody = soup.find("tbody")
+
+ children = tbody.findChildren("a", recursive=True)
+ for child in children:
+ _symbols.append(str(child).split('"')[-1].split(">")[1].split("<")[0])
+
+ return _symbols
+
+ if _BR_SYMBOLS is None:
+ _all_symbols = _get_ibovespa()
+ if qlib_data_path is not None:
+ for _index in ["ibov"]:
+ ins_df = pd.read_csv(
+ Path(qlib_data_path).joinpath(f"instruments/{_index}.txt"),
+ sep="\t",
+ names=["symbol", "start_date", "end_date"],
+ )
+ _all_symbols += ins_df["symbol"].unique().tolist()
+
+ def _format(s_):
+ s_ = s_.strip()
+ s_ = s_.strip("$")
+ s_ = s_.strip("*")
+ s_ = s_ + ".SA"
+ return s_
+
+ _BR_SYMBOLS = sorted(set(map(_format, _all_symbols)))
+
+ return _BR_SYMBOLS
+
+
def get_en_fund_symbols(qlib_data_path: [str, Path] = None) -> list:
"""get en fund symbols
@@ -502,6 +559,50 @@ def generate_minutes_calendar_from_daily(
return pd.Index(sorted(set(np.hstack(res))))
+def get_instruments(
+ qlib_dir: str,
+ index_name: str,
+ method: str = "parse_instruments",
+ freq: str = "day",
+ request_retry: int = 5,
+ retry_sleep: int = 3,
+ market_index: str = "cn_index"
+):
+ """
+
+ Parameters
+ ----------
+ qlib_dir: str
+ qlib data dir, default "Path(__file__).parent/qlib_data"
+ index_name: str
+ index name, value from ["csi100", "csi300"]
+ method: str
+ method, value from ["parse_instruments", "save_new_companies"]
+ freq: str
+ freq, value from ["day", "1min"]
+ request_retry: int
+ request retry, by default 5
+ retry_sleep: int
+ request sleep, by default 3
+ market_index: str
+ Where the files to obtain the index are located,
+ for example data_collector.cn_index.collector
+
+ Examples
+ -------
+ # parse instruments
+ $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method parse_instruments
+
+ # parse new companies
+ $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies
+
+ """
+ _cur_module = importlib.import_module("data_collector.{}.collector".format(market_index))
+ obj = getattr(_cur_module, f"{index_name.upper()}Index")(
+ qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
+ )
+ getattr(obj, method)()
+
if __name__ == "__main__":
- assert len(get_hs_stock_symbols()) >= MINIMUM_SYMBOLS_NUM
+ assert len(get_hs_stock_symbols()) >= MINIMUM_SYMBOLS_NUM
\ No newline at end of file
diff --git a/scripts/data_collector/yahoo/README.md b/scripts/data_collector/yahoo/README.md
index 71f2b75f8e..3ce9bae7f6 100644
--- a/scripts/data_collector/yahoo/README.md
+++ b/scripts/data_collector/yahoo/README.md
@@ -66,7 +66,7 @@ pip install -r requirements.txt
- `source_dir`: save the directory
- `interval`: `1d` or `1min`, by default `1d`
> **due to the limitation of the *YahooFinance API*, only the last month's data is available in `1min`**
- - `region`: `CN` or `US` or `IN`, by default `CN`
+ - `region`: `CN` or `US` or `IN` or `BR`, by default `CN`
- `delay`: `time.sleep(delay)`, by default *0.5*
- `start`: start datetime, by default *"2000-01-01"*; *closed interval(including start)*
- `end`: end datetime, by default `pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))`; *open interval(excluding end)*
@@ -80,14 +80,21 @@ pip install -r requirements.txt
python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_data --start 2020-01-01 --end 2020-12-31 --delay 1 --interval 1d --region CN
# cn 1min data
python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_data_1min --delay 1 --interval 1min --region CN
+
# us 1d data
python collector.py download_data --source_dir ~/.qlib/stock_data/source/us_data --start 2020-01-01 --end 2020-12-31 --delay 1 --interval 1d --region US
# us 1min data
python collector.py download_data --source_dir ~/.qlib/stock_data/source/us_data_1min --delay 1 --interval 1min --region US
+
# in 1d data
python collector.py download_data --source_dir ~/.qlib/stock_data/source/in_data --start 2020-01-01 --end 2020-12-31 --delay 1 --interval 1d --region IN
# in 1min data
python collector.py download_data --source_dir ~/.qlib/stock_data/source/in_data_1min --delay 1 --interval 1min --region IN
+
+ # br 1d data
+ python collector.py download_data --source_dir ~/.qlib/stock_data/source/br_data --start 2003-01-03 --end 2022-03-01 --delay 1 --interval 1d --region BR
+ # br 1min data
+ python collector.py download_data --source_dir ~/.qlib/stock_data/source/br_data_1min --delay 1 --interval 1min --region BR
```
2. normalize data: `python scripts/data_collector/yahoo/collector.py normalize_data`
@@ -116,8 +123,15 @@ pip install -r requirements.txt
```bash
# normalize 1d cn
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/cn_data --normalize_dir ~/.qlib/stock_data/source/cn_1d_nor --region CN --interval 1d
+
# normalize 1min cn
python collector.py normalize_data --qlib_data_1d_dir ~/.qlib/qlib_data/cn_data --source_dir ~/.qlib/stock_data/source/cn_data_1min --normalize_dir ~/.qlib/stock_data/source/cn_1min_nor --region CN --interval 1min
+
+ # normalize 1d br
+ python scripts/data_collector/yahoo/collector.py normalize_data --source_dir ~/.qlib/stock_data/source/br_data --normalize_dir ~/.qlib/stock_data/source/br_1d_nor --region BR --interval 1d
+
+ # normalize 1min br
+ python collector.py normalize_data --qlib_data_1d_dir ~/.qlib/qlib_data/br_data --source_dir ~/.qlib/stock_data/source/br_data_1min --normalize_dir ~/.qlib/stock_data/source/br_1min_nor --region BR --interval 1min
```
3. dump data: `python scripts/dump_bin.py dump_all`
diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py
index e99a30d2a6..d57a3057b8 100644
--- a/scripts/data_collector/yahoo/collector.py
+++ b/scripts/data_collector/yahoo/collector.py
@@ -2,6 +2,7 @@
# Licensed under the MIT License.
import abc
+from re import I
import sys
import copy
import time
@@ -35,6 +36,7 @@
get_hs_stock_symbols,
get_us_stock_symbols,
get_in_stock_symbols,
+ get_br_stock_symbols,
generate_minutes_calendar_from_daily,
)
@@ -42,6 +44,8 @@
class YahooCollector(BaseCollector):
+ retry = 5 # Configuration attribute. How many times will it try to re-request the data if the network fails.
+
def __init__(
self,
save_dir: [str, Path],
@@ -146,7 +150,7 @@ def _show_logging_func():
def get_data(
self, symbol: str, interval: str, start_datetime: pd.Timestamp, end_datetime: pd.Timestamp
) -> pd.DataFrame:
- @deco_retry(retry_sleep=self.delay)
+ @deco_retry(retry_sleep=self.delay, retry=self.retry)
def _get_simple(start_, end_):
self.sleep()
_remote_interval = "1m" if interval == self.INTERVAL_1min else interval
@@ -311,6 +315,55 @@ class YahooCollectorIN1min(YahooCollectorIN):
pass
+class YahooCollectorBR(YahooCollector, ABC):
+ def retry(cls):
+ """"
+ The reason to use retry=2 is due to the fact that
+ Yahoo Finance unfortunately does not keep track of some
+ Brazilian stocks.
+
+ Therefore, the decorator deco_retry with retry argument
+ set to 5 will keep trying to get the stock data up to 5 times,
+ which makes the code to download Brazilians stocks very slow.
+
+ In future, this may change, but for now
+ I suggest to leave retry argument to 1 or 2 in
+ order to improve download speed.
+
+ To achieve this goal an abstract attribute (retry)
+ was added into YahooCollectorBR base class
+ """
+ raise NotImplementedError
+
+ def get_instrument_list(self):
+ logger.info("get BR stock symbols......")
+ symbols = get_br_stock_symbols() + [
+ "^BVSP",
+ ]
+ logger.info(f"get {len(symbols)} symbols.")
+ return symbols
+
+ def download_index_data(self):
+ pass
+
+ def normalize_symbol(self, symbol):
+ return code_to_fname(symbol).upper()
+
+ @property
+ def _timezone(self):
+ return "Brazil/East"
+
+
+class YahooCollectorBR1d(YahooCollectorBR):
+ retry = 2
+ pass
+
+
+class YahooCollectorBR1min(YahooCollectorBR):
+ retry = 2
+ pass
+
+
class YahooNormalize(BaseNormalize):
COLUMNS = ["open", "close", "high", "low", "volume"]
DAILY_FORMAT = "%Y-%m-%d"
@@ -833,6 +886,29 @@ def _get_1d_calendar_list(self) -> Iterable[pd.Timestamp]:
return get_calendar_list("ALL")
+class YahooNormalizeBR:
+ def _get_calendar_list(self) -> Iterable[pd.Timestamp]:
+ return get_calendar_list("BR_ALL")
+
+
+class YahooNormalizeBR1d(YahooNormalizeBR, YahooNormalize1d):
+ pass
+
+
+class YahooNormalizeBR1min(YahooNormalizeBR, YahooNormalize1minOffline):
+ CALC_PAUSED_NUM = False
+
+ def _get_calendar_list(self) -> Iterable[pd.Timestamp]:
+ # TODO: support 1min
+ raise ValueError("Does not support 1min")
+
+ def _get_1d_calendar_list(self):
+ return get_calendar_list("BR_ALL")
+
+ def symbol_to_yahoo(self, symbol):
+ return fname_to_code(symbol)
+
+
class Run(BaseRun):
def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval="1d", region=REGION_CN):
"""
@@ -848,7 +924,7 @@ def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval=
interval: str
freq, value from [1min, 1d], default 1d
region: str
- region, value from ["CN", "US"], default "CN"
+ region, value from ["CN", "US", "BR"], default "CN"
"""
super().__init__(source_dir, normalize_dir, max_workers, interval)
self.region = region
diff --git a/scripts/data_collector/yahoo/requirements.txt b/scripts/data_collector/yahoo/requirements.txt
index 61422c7ab6..1a58eda1f6 100644
--- a/scripts/data_collector/yahoo/requirements.txt
+++ b/scripts/data_collector/yahoo/requirements.txt
@@ -7,3 +7,6 @@ tqdm
lxml
yahooquery
joblib
+beautifulsoup4
+bs4
+soupsieve
\ No newline at end of file