Dataset之MNIST:MNIST(手写数字图片识别+ubyte.gz文件)数据集的下载(基于python语言根据爬虫技术自动下载MNIST数据集)
Dataset之MNIST:MNIST(手写数字图片识别+ubyte.gz文件)数据集的下载(基于python语言根据爬虫技术自动下载MNIST数据集)
数据集下载的所有代码
代码打包地址:mnist数据集下载的完整代码——mnist_download_main.rar
1、主文件 mnist_download_main.py文件
#1、读取数据集
# MNIST数据集大约12MB,如果没在指定的路径中找到就会自动下载。
from mnist import MNIST
# Load the MNIST data-set from the given directory. Per the class documentation
# the files are downloaded automatically when they are not found there.
# The data consists of 70,000 images with their class labels, split into
# three independent sub-sets (training, validation and test).
data = MNIST(data_dir="data/MNIST/")

# Report how many images each sub-set holds, one line per sub-set.
print("Size of:")
for line_format, subset_size in (
        ("- Training-set:\t\t{}", data.num_train),
        ("- Validation-set:\t{}", data.num_val),
        ("- Test-set:\t\t{}", data.num_test)):
    print(line_format.format(subset_size))
2、mnist.py文件
########################################################################
#
# Downloads the MNIST data-set for recognizing hand-written digits.
#
# Implemented in Python 3.6
#
# Usage:
# 1) Create a new object instance: data = MNIST(data_dir="data/MNIST/")
# This automatically downloads the files to the given dir.
# 2) Use the training-set as data.x_train, data.y_train and data.y_train_cls
# 3) Get random batches of training data using data.random_batch()
# 4) Use the test-set as data.x_test, data.y_test and data.y_test_cls
#
########################################################################
#
# This file is part of the TensorFlow Tutorials available at:
#
# https://github.com/Hvass-Labs/TensorFlow-Tutorials
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2016-18 by Magnus Erik Hvass Pedersen
#
########################################################################
import numpy as np
import gzip
import os
from dataset import one_hot_encoded
from download import download
########################################################################
# Base URL for downloading the data-files from the internet.
# The filenames below are appended directly to this string to form the
# download URL, so it must end with a slash.
base_url = "https://storage.googleapis.com/cvdf-datasets/mnist/"
# Filenames for the data-set. All four are gzip-compressed files:
# the "x" files are loaded as images and the "y" files as class-numbers,
# for the training-set and the test-set respectively.
filename_x_train = "train-images-idx3-ubyte.gz"
filename_y_train = "train-labels-idx1-ubyte.gz"
filename_x_test = "t10k-images-idx3-ubyte.gz"
filename_y_test = "t10k-labels-idx1-ubyte.gz"
########################################################################
class MNIST:
    """
    The MNIST data-set for recognizing hand-written digits.

    This automatically downloads the data-files if they do
    not already exist in the local data_dir.

    After construction the following arrays are available:
    x_train / x_val / x_test (flattened images),
    y_train_cls / y_val_cls / y_test_cls (integer class-numbers) and
    y_train / y_val / y_test (one-hot encoded class-numbers).

    Note: Pixel-values are floats between 0.0 and 1.0.
    """

    # The images are 28 pixels in each dimension.
    img_size = 28

    # The images are stored in one-dimensional arrays of this length.
    img_size_flat = img_size * img_size

    # Tuple with height and width of images used to reshape arrays.
    img_shape = (img_size, img_size)

    # Number of colour channels for the images: 1 channel for gray-scale.
    num_channels = 1

    # Tuple with height, width and depth used to reshape arrays.
    # This is used for reshaping in Keras.
    img_shape_full = (img_size, img_size, num_channels)

    # Number of classes, one class for each of 10 digits.
    num_classes = 10

    def __init__(self, data_dir="data/MNIST/"):
        """
        Load the MNIST data-set. Automatically downloads the files
        if they do not already exist locally.

        :param data_dir: Base-directory for downloading files.
        """
        # Copy args to self.
        self.data_dir = data_dir

        # Number of images in each sub-set.
        self.num_train = 55000
        self.num_val = 5000
        self.num_test = 10000

        # Download / load the training-set.
        x_train = self._load_images(filename=filename_x_train)
        y_train_cls = self._load_cls(filename=filename_y_train)

        # Split the training-set into train / validation: the first
        # num_train images are used for training and the rest for validation.
        # Pixel-values are converted from ints between 0 and 255
        # to floats between 0.0 and 1.0.
        self.x_train = x_train[0:self.num_train] / 255.0
        self.x_val = x_train[self.num_train:] / 255.0
        self.y_train_cls = y_train_cls[0:self.num_train]
        self.y_val_cls = y_train_cls[self.num_train:]

        # Download / load the test-set.
        self.x_test = self._load_images(filename=filename_x_test) / 255.0
        self.y_test_cls = self._load_cls(filename=filename_y_test)

        # Convert the class-numbers from bytes to ints as that is needed
        # some places in TensorFlow.
        # The builtin int is used because the alias np.int was deprecated
        # in NumPy 1.20 and removed in NumPy 1.24; it was the same type.
        self.y_train_cls = self.y_train_cls.astype(int)
        self.y_val_cls = self.y_val_cls.astype(int)
        self.y_test_cls = self.y_test_cls.astype(int)

        # Convert the integer class-numbers into one-hot encoded arrays.
        self.y_train = one_hot_encoded(class_numbers=self.y_train_cls,
                                       num_classes=self.num_classes)
        self.y_val = one_hot_encoded(class_numbers=self.y_val_cls,
                                     num_classes=self.num_classes)
        self.y_test = one_hot_encoded(class_numbers=self.y_test_cls,
                                      num_classes=self.num_classes)

    def _load_data(self, filename, offset):
        """
        Load the data in the given file. Automatically downloads the file
        if it does not already exist in the data_dir.

        :param filename: Name of the data-file.
        :param offset: Start offset in bytes when reading the data-file.
        :return: The data as a 1-dim numpy array of unsigned bytes.
        """
        # Download the file from the internet if it does not exist locally.
        download(base_url=base_url, filename=filename,
                 download_dir=self.data_dir)

        # Read the whole decompressed data-file and skip the first
        # `offset` bytes when interpreting it as an array.
        path = os.path.join(self.data_dir, filename)
        with gzip.open(path, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=offset)

        return data

    def _load_images(self, filename):
        """
        Load image-data from the given file.
        Automatically downloads the file if it does not exist locally.

        :param filename: Name of the data-file.
        :return: Numpy array with shape (num_images, img_size_flat).
        """
        # Read the data as one long array of bytes,
        # skipping the first 16 bytes of the file.
        data = self._load_data(filename=filename, offset=16)

        # Reshape to 2-dim array with shape (num_images, img_size_flat).
        images_flat = data.reshape(-1, self.img_size_flat)

        return images_flat

    def _load_cls(self, filename):
        """
        Load class-numbers from the given file.
        Automatically downloads the file if it does not exist locally.

        :param filename: Name of the data-file.
        :return: 1-dim numpy array with one class-number per image.
        """
        # The first 8 bytes of the file are skipped.
        return self._load_data(filename=filename, offset=8)

    def random_batch(self, batch_size=32):
        """
        Create a random batch of training-data.

        The indices are drawn independently, so the same image
        may occur more than once in a batch.

        :param batch_size: Number of images in the batch.
        :return: 3 numpy arrays (x, y, y_cls)
        """
        # Create a random index into the training-set.
        idx = np.random.randint(low=0, high=self.num_train, size=batch_size)

        # Use the index to lookup random training-data.
        x_batch = self.x_train[idx]
        y_batch = self.y_train[idx]
        y_batch_cls = self.y_train_cls[idx]

        return x_batch, y_batch, y_batch_cls
########################################################################
3、dataset.py文件
########################################################################
#
# Class for creating a data-set consisting of all files in a directory.
#
# Example usage is shown in the file knifey.py and Tutorial #09.
#
# Implemented in Python 3.5
#
########################################################################
#
# This file is part of the TensorFlow Tutorials available at:
#
# https://github.com/Hvass-Labs/TensorFlow-Tutorials
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2016 by Magnus Erik Hvass Pedersen
#
########################################################################
import numpy as np
import os
import shutil
from cache import cache
########################################################################
def one_hot_encoded(class_numbers, num_classes=None):
    """
    Build one-hot encoded class-labels from integer class-numbers.

    Each class-number selects the matching row of an identity matrix,
    e.g. class_number=2 with num_classes=4 gives [0. 0. 1. 0.]

    :param class_numbers:
        Array of integers with class-numbers.
        Assume the integers are from zero to num_classes-1 inclusive.

    :param num_classes:
        Number of classes. If None then use max(class_numbers)+1.

    :return:
        2-dim float array of shape: [len(class_numbers), num_classes]
    """
    # Infer the number of classes from the data when it is not given.
    # This assumes the lowest class-number is zero.
    num_classes = (np.max(class_numbers) + 1
                   if num_classes is None else num_classes)

    # Row k of the identity matrix is the one-hot vector for class k,
    # so indexing with the class-numbers picks one row per label.
    identity_rows = np.eye(num_classes, dtype=float)
    return identity_rows[class_numbers]
########################################################################
class DataSet:
    """
    A data-set made of the files found in the sub-directories of a root-dir.

    Every sub-directory of the root-dir is one class. Matching files directly
    inside it form the training-set, and matching files inside its optional
    'test' sub-directory form the test-set.
    """

    def __init__(self, in_dir, exts='.jpg'):
        """
        Create a data-set consisting of the filenames in the given directory
        and sub-dirs that match the given filename-extensions.

        For example, the knifey-spoony data-set (see knifey.py) has the
        following dir-structure:

        knifey-spoony/forky/
        knifey-spoony/knifey/
        knifey-spoony/spoony/
        knifey-spoony/forky/test/
        knifey-spoony/knifey/test/
        knifey-spoony/spoony/test/

        This means there are 3 classes called: forky, knifey, and spoony.

        If we set in_dir = "knifey-spoony/" and create a new DataSet-object
        then it will scan through these directories and create a training-set
        and test-set for each of these classes.

        The training-set will contain a list of all the *.jpg filenames
        in the following directories:

        knifey-spoony/forky/
        knifey-spoony/knifey/
        knifey-spoony/spoony/

        The test-set will contain a list of all the *.jpg filenames
        in the following directories:

        knifey-spoony/forky/test/
        knifey-spoony/knifey/test/
        knifey-spoony/spoony/test/

        See the TensorFlow Tutorial #09 for a usage example.

        :param in_dir:
            Root-dir for the files in the data-set.
            This would be 'knifey-spoony/' in the example above.

        :param exts:
            String or tuple of strings with valid filename-extensions.
            Not case-sensitive.

        :return:
            Object instance.
        """
        # Extend the input directory to the full path.
        in_dir = os.path.abspath(in_dir)

        # Input directory.
        self.in_dir = in_dir

        # A single extension given as a string must be wrapped in a tuple.
        # Iterating the string directly would split it into characters,
        # e.g. '.jpg' -> ('.', 'j', 'p', 'g'), which would then match
        # every filename ending in any one of those characters.
        if isinstance(exts, str):
            exts = (exts,)

        # Convert all file-extensions to lower-case.
        self.exts = tuple(ext.lower() for ext in exts)

        # Names for the classes.
        self.class_names = []

        # Filenames for all the files in the training-set.
        self.filenames = []

        # Filenames for all the files in the test-set.
        self.filenames_test = []

        # Class-number for each file in the training-set.
        self.class_numbers = []

        # Class-number for each file in the test-set.
        self.class_numbers_test = []

        # Total number of classes in the data-set.
        self.num_classes = 0

        # For all files/dirs in the input directory.
        for name in os.listdir(in_dir):
            # Full path for the file / dir.
            current_dir = os.path.join(in_dir, name)

            # Only directories define classes; plain files are skipped.
            if not os.path.isdir(current_dir):
                continue

            # Add the dir-name to the list of class-names.
            self.class_names.append(name)

            # The class-number for this class is its position
            # in the list of class-names.
            class_number = self.num_classes

            # Training-set.
            # Get all the valid filenames in the dir (not sub-dirs).
            filenames = self._get_filenames(current_dir)
            self.filenames.extend(filenames)
            self.class_numbers.extend([class_number] * len(filenames))

            # Test-set.
            # Get all the valid filenames in the sub-dir named 'test'.
            filenames_test = self._get_filenames(
                os.path.join(current_dir, 'test'))
            self.filenames_test.extend(filenames_test)
            self.class_numbers_test.extend(
                [class_number] * len(filenames_test))

            # Increase the total number of classes in the data-set.
            self.num_classes += 1

    def _get_filenames(self, dir):
        """
        Create and return a list of filenames with matching extensions
        in the given directory.

        :param dir:
            Directory to scan for files. Sub-dirs are not scanned.

        :return:
            List of filenames. Only filenames. Does not include the directory.
            The list is empty if the directory does not exist.
        """
        # A missing directory (e.g. a class without a 'test' sub-dir)
        # simply contributes no files.
        if not os.path.exists(dir):
            return []

        # Keep the names whose lower-cased form ends with a valid extension.
        return [filename for filename in os.listdir(dir)
                if filename.lower().endswith(self.exts)]

    def get_paths(self, test=False):
        """
        Get the full paths for the files in the data-set.

        :param test:
            Boolean. Return the paths for the test-set (True)
            or training-set (False).

        :return:
            Iterator with strings for the path-names.
        """
        if test:
            # Use the filenames and class-numbers for the test-set.
            filenames = self.filenames_test
            class_numbers = self.class_numbers_test

            # Sub-dir for test-set.
            test_dir = "test/"
        else:
            # Use the filenames and class-numbers for the training-set.
            filenames = self.filenames
            class_numbers = self.class_numbers

            # Don't use a sub-dir for the training-set. An empty path
            # component leaves the joined path unchanged.
            test_dir = ""

        for filename, cls in zip(filenames, class_numbers):
            # Full path-name for the file.
            yield os.path.join(self.in_dir, self.class_names[cls],
                               test_dir, filename)

    def get_training_set(self):
        """
        Return the list of paths for the files in the training-set,
        and the list of class-numbers as integers,
        and the class-numbers as one-hot encoded arrays.
        """
        return list(self.get_paths()), \
            np.asarray(self.class_numbers), \
            one_hot_encoded(class_numbers=self.class_numbers,
                            num_classes=self.num_classes)

    def get_test_set(self):
        """
        Return the list of paths for the files in the test-set,
        and the list of class-numbers as integers,
        and the class-numbers as one-hot encoded arrays.
        """
        return list(self.get_paths(test=True)), \
            np.asarray(self.class_numbers_test), \
            one_hot_encoded(class_numbers=self.class_numbers_test,
                            num_classes=self.num_classes)

    def copy_files(self, train_dir, test_dir):
        """
        Copy all the files in the training-set to train_dir
        and copy all the files in the test-set to test_dir.

        For example, the normal directory structure for the
        different classes in the training-set is:

        knifey-spoony/forky/
        knifey-spoony/knifey/
        knifey-spoony/spoony/

        Normally the test-set is a sub-dir of the training-set:

        knifey-spoony/forky/test/
        knifey-spoony/knifey/test/
        knifey-spoony/spoony/test/

        But some APIs use another dir-structure for the training-set:

        knifey-spoony/train/forky/
        knifey-spoony/train/knifey/
        knifey-spoony/train/spoony/

        and for the test-set:

        knifey-spoony/test/forky/
        knifey-spoony/test/knifey/
        knifey-spoony/test/spoony/

        :param train_dir: Directory for the training-set e.g. 'knifey-spoony/train/'
        :param test_dir: Directory for the test-set e.g. 'knifey-spoony/test/'
        :return: Nothing.
        """

        # Helper-function for actually copying the files.
        def _copy_files(src_paths, dst_dir, class_numbers):
            # Create a list of dirs for each class, e.g.:
            # ['knifey-spoony/test/forky/',
            #  'knifey-spoony/test/knifey/',
            #  'knifey-spoony/test/spoony/']
            class_dirs = [os.path.join(dst_dir, class_name + "/")
                          for class_name in self.class_names]

            # Check if each class-directory exists, otherwise create it.
            for class_dir in class_dirs:
                if not os.path.exists(class_dir):
                    os.makedirs(class_dir)

            # For all the file-paths and associated class-numbers,
            # copy the file to the destination dir for that class.
            for src, cls in zip(src_paths, class_numbers):
                shutil.copy(src=src, dst=class_dirs[cls])

        # Copy the files for the training-set.
        _copy_files(src_paths=self.get_paths(test=False),
                    dst_dir=train_dir,
                    class_numbers=self.class_numbers)

        print("- Copied training-set to:", train_dir)

        # Copy the files for the test-set.
        _copy_files(src_paths=self.get_paths(test=True),
                    dst_dir=test_dir,
                    class_numbers=self.class_numbers_test)

        print("- Copied test-set to:", test_dir)
########################################################################
def load_cached(cache_path, in_dir):
    """
    Return a DataSet-object for in_dir, going through the cache-wrapper.

    The work is delegated to cache(): it is given the cache-file path,
    the DataSet class and the in_dir argument, and its result is returned.
    According to this module's usage notes the object is reloaded from
    the cache-file when that file exists and is otherwise created and saved.

    This is useful if you need to ensure the ordering of the
    filenames is consistent every time you load the data-set,
    for example if you use the DataSet-object in combination
    with Transfer Values saved to another cache-file, see e.g.
    Tutorial #09 for an example of this.

    :param cache_path:
        File-path for the cache-file.

    :param in_dir:
        Root-dir for the files in the data-set.
        This is an argument for the DataSet-init function.

    :return:
        The DataSet-object.
    """
    print("Creating dataset from the files in: " + in_dir)

    # DataSet is passed as the callable, so a new instance is only
    # constructed if cache() decides to call it with in_dir.
    return cache(cache_path=cache_path, fn=DataSet, in_dir=in_dir)
########################################################################
4、cache.py文件
########################################################################
#
# Cache-wrapper for a function or class.
#
# Save the result of calling a function or creating an object-instance
# to harddisk. This is used to persist the data so it can be reloaded
# very quickly and easily.
#
# Implemented in Python 3.5
#
########################################################################
#
# This file is part of the TensorFlow Tutorials available at:
#
# https://github.com/Hvass-Labs/TensorFlow-Tutorials
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2016 by Magnus Erik Hvass Pedersen
#
########################################################################
import os
import pickle
import numpy as np
########################################################################
def cache(cache_path, fn, *args, **kwargs):
    """
    Cache-wrapper for a function or class. If the cache-file exists
    then the data is reloaded and returned, otherwise the function
    is called and the result is saved to cache. The fn-argument can
    also be a class instead, in which case an object-instance is
    created and saved to the cache-file.

    Note that only the existence of the cache-file is checked: the
    arguments are not compared with those used when the file was written.

    :param cache_path:
        File-path for the cache-file. Missing parent directories
        are created before the file is written.

    :param fn:
        Function or class to be called.

    :param args:
        Arguments to the function or class-init.

    :param kwargs:
        Keyword arguments to the function or class-init.

    :return:
        The result of calling the function or creating the object-instance.
    """
    # If the cache-file exists then reload the data from it.
    if os.path.exists(cache_path):
        # NOTE: unpickling can execute arbitrary code, so cache-files
        # must only come from a trusted source.
        with open(cache_path, mode='rb') as file:
            obj = pickle.load(file)

        print("- Data loaded from cache-file: " + cache_path)
        return obj

    # The cache-file does not exist.
    # Call the function / class-init with the supplied arguments.
    obj = fn(*args, **kwargs)

    # Make sure the directory for the cache-file exists, otherwise the
    # result that was just computed would be lost to a FileNotFoundError.
    # The dirname is empty when cache_path is a bare filename.
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    # Save the data to a cache-file.
    with open(cache_path, mode='wb') as file:
        pickle.dump(obj, file)

    print("- Data saved to cache-file: " + cache_path)
    return obj
########################################################################
def convert_numpy2pickle(in_path, out_path):
    """
    Re-save the contents of a numpy-file as a pickle-file.

    The first version of the cache-function used numpy for saving the data.
    Instead of re-calculating all the data, you can just convert the
    cache-file using this function.

    :param in_path:
        Input file in numpy-format written using numpy.save().

    :param out_path:
        Output file written as a pickle-file.

    :return:
        Nothing.
    """
    # Read the input first, so no output file is created if loading fails.
    loaded = np.load(in_path)

    # Write whatever numpy returned to the output file using pickle.
    with open(out_path, mode='wb') as out_file:
        pickle.dump(loaded, out_file)
########################################################################
if __name__ == '__main__':
    # Short demonstration of the cache-wrapper, first with a function
    # and then with a class.

    # Stand-in for a function that would normally take a long time to
    # compute. It is only called when no cache-file exists yet.
    def expensive_function(a, b):
        return a * b

    print('Computing expensive_function() ...')

    # Returns the value stored in the cache-file if there is one,
    # otherwise computes expensive_function(a=123, b=456) and
    # stores the result in the cache-file for next time.
    product = cache(cache_path='cache_expensive_function.pkl',
                    fn=expensive_function, a=123, b=456)

    print('result =', product)

    # Newline.
    print()

    # Second example: cache an object-instance instead of a plain value.
    # The class name and attribute names are part of what gets pickled,
    # so they must stay stable for existing cache-files to load.
    class ExpensiveClass:
        def __init__(self, c, d):
            self.c, self.d = c, d
            self.result = c * d

        def print_result(self):
            print('c =', self.c)
            print('d =', self.d)
            print('result = c * d =', self.result)

    print('Creating object from ExpensiveClass() ...')

    # Returns the object stored in the cache-file if there is one,
    # otherwise creates ExpensiveClass(c=123, d=456) and stores
    # the object in the cache-file for the next time.
    instance = cache(cache_path='cache_ExpensiveClass.pkl',
                     fn=ExpensiveClass, c=123, d=456)

    instance.print_result()
########################################################################
5、download.py文件
########################################################################
#
# Functions for downloading and extracting data-files from the internet.
#
# Implemented in Python 3.5
#
########################################################################
#
# This file is part of the TensorFlow Tutorials available at:
#
# https://github.com/Hvass-Labs/TensorFlow-Tutorials
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2016 by Magnus Erik Hvass Pedersen
#
########################################################################
import sys
import os
import urllib.request
import tarfile
import zipfile
########################################################################
def _print_download_progress(count, block_size, total_size):
    """
    Function used for printing the download progress.
    Used as a call-back function in maybe_download_and_extract().

    :param count: Number of blocks transferred so far.
    :param block_size: Size of each block in bytes.
    :param total_size:
        Total size of the file in bytes. May be zero or negative when
        the size is unknown, in which case the number of bytes
        transferred is printed instead of a percentage.
    :return: Nothing.
    """
    # Number of bytes transferred so far. This may overshoot the total
    # size because the last block is usually only partially filled.
    num_bytes = count * block_size

    if total_size > 0:
        # Percentage completion, limited because the overshoot
        # mentioned above may cause it to exceed 100%.
        pct_complete = min(1.0, float(num_bytes) / total_size)

        # Status-message. Note the \r which means the line should
        # overwrite itself.
        msg = "\r- Download progress: {0:.1%}".format(pct_complete)
    else:
        # Unknown total size: a percentage cannot be computed and
        # dividing would either fail or give a negative number.
        msg = "\r- Download progress: {0:,} bytes".format(num_bytes)

    # Print it.
    sys.stdout.write(msg)
    sys.stdout.flush()
########################################################################
def download(base_url, filename, download_dir):
    """
    Download the given file if it does not already exist in the download_dir.

    The file is first written under a temporary name and only renamed to
    its final name once the transfer has finished, so an interrupted
    download is not mistaken for a complete file on the next call.

    :param base_url: The internet URL without the filename.
    :param filename: The filename that will be added to the base_url.
    :param download_dir: Local directory for storing the file.
    :return: Nothing.
    """
    # Path for local file.
    save_path = os.path.join(download_dir, filename)

    # Nothing to do if the file already exists.
    if os.path.exists(save_path):
        return

    # Create the download directory if needed. An empty string
    # means the current directory, which always exists.
    if download_dir:
        os.makedirs(download_dir, exist_ok=True)

    print("Downloading", filename, "...")

    # Download the file from the internet to a temporary path.
    url = base_url + filename
    temp_path = save_path + ".part"
    try:
        urllib.request.urlretrieve(url=url,
                                   filename=temp_path,
                                   reporthook=_print_download_progress)

        # The transfer succeeded, so move the file to its final name.
        os.replace(temp_path, save_path)
    finally:
        # Remove the partial file if the transfer or the rename failed.
        if os.path.exists(temp_path):
            os.remove(temp_path)

    print(" Done!")
def maybe_download_and_extract(url, download_dir):
    """
    Download and extract the data if it doesn't already exist.
    Assumes the url is a zip-file or a gzip-compressed tar-ball file.
    Files with any other extension are downloaded but not extracted.

    :param url:
        Internet URL for the tar-file to download.
        Example: "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"

    :param download_dir:
        Directory where the downloaded file is saved.
        Example: "data/CIFAR-10/"

    :return:
        Nothing.
    """
    # Filename for saving the file downloaded from the internet.
    # Use the filename from the URL and add it to the download_dir.
    filename = url.split('/')[-1]
    file_path = os.path.join(download_dir, filename)

    # Check if the file already exists.
    # If it exists then we assume it has also been extracted.
    if os.path.exists(file_path):
        print("Data has apparently already been downloaded and unpacked.")
        return

    # Create the download directory if needed. An empty string
    # means the current directory, which always exists.
    if download_dir:
        os.makedirs(download_dir, exist_ok=True)

    # Download the file from the internet.
    file_path, _ = urllib.request.urlretrieve(
        url=url,
        filename=file_path,
        reporthook=_print_download_progress)

    print()
    print("Download finished. Extracting files.")

    # NOTE: extractall() trusts the member paths stored in the archive,
    # so only archives from trusted sources should be used here.
    # The archives are opened as context managers so the file handles
    # are closed again even if the extraction fails.
    if file_path.endswith(".zip"):
        # Unpack the zip-file.
        with zipfile.ZipFile(file=file_path, mode="r") as archive:
            archive.extractall(download_dir)
    elif file_path.endswith((".tar.gz", ".tgz")):
        # Unpack the tar-ball.
        with tarfile.open(name=file_path, mode="r:gz") as archive:
            archive.extractall(download_dir)

    print("Done.")
########################################################################
赞 (0)