Source code for tensorlayerx.files.dataset_loaders.matt_mahoney_dataset

#! /usr/bin/python
# -*- coding: utf-8 -*-

import os
import zipfile

from tensorlayerx import logging
from tensorlayerx.files.utils import maybe_download_and_extract

logging.set_verbosity(logging.INFO)

__all__ = ['load_matt_mahoney_text8_dataset']


def load_matt_mahoney_text8_dataset(path='data'):
    """Load Matt Mahoney's text8 dataset.

    Download a text file from Matt Mahoney's website if it is not present,
    and make sure it is the right size. Extract the first file enclosed in
    the zip file as a list of words. This dataset can be used for word
    embedding.

    Parameters
    ----------
    path : str
        The path that the data is downloaded to, defaults to ``data/mm_test8/``.

    Returns
    -------
    list of str
        The raw text data e.g. [.... 'their', 'families', 'who', 'were', 'expelled', 'from', 'jerusalem', ...]

    Examples
    --------
    >>> words = tlx.files.load_matt_mahoney_text8_dataset()
    >>> print('Data size', len(words))

    """
    path = os.path.join(path, 'mm_test8')
    logging.info(
        "If this dataset cannot be downloaded automatically, please download it "
        "manually from the official website (mm_test8 Dataset "
        "<http://mattmahoney.net/dc/>) and place it under 'data/mm_test8/' by default."
    )

    filename = 'text8.zip'
    url = 'http://mattmahoney.net/dc/'
    # Download the archive if it is missing and check that it has the expected size.
    maybe_download_and_extract(filename, path, url, expected_bytes=31344016)

    with zipfile.ZipFile(os.path.join(path, filename)) as f:
        # The archive contains a single file; read it, split on whitespace,
        # and decode each token from bytes to str.
        word_list = [word.decode() for word in f.read(f.namelist()[0]).split()]
    return word_list
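

# A minimal usage sketch (illustration only, not part of the original module):
# build an integer-id vocabulary from the loaded word list, a common
# preprocessing step before training word embeddings. The ``vocabulary_size``
# cutoff and the 'UNK' token are assumptions made for this example.
if __name__ == '__main__':
    import collections

    words = load_matt_mahoney_text8_dataset(path='data')
    print('Data size', len(words))

    vocabulary_size = 50000  # assumed cutoff; keep only the most frequent tokens
    # Reserve id 0 for out-of-vocabulary words ('UNK').
    counts = [('UNK', -1)] + collections.Counter(words).most_common(vocabulary_size - 1)
    word_to_id = {word: i for i, (word, _) in enumerate(counts)}
    # Map the corpus to integer ids, falling back to the 'UNK' id for rare words.
    data = [word_to_id.get(word, 0) for word in words]
    print('Most common words (+UNK):', counts[:5])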