PyTorch DataLoader very slow with HDF5 data

Thread starter: Jash Jasani (Guest)
I have a very large dataset in HDF5 format which I cannot load into memory all at once, so I'm using a custom Dataset from torch. Here's the code:

Code:
import time
from utils import get_vocab_and_skipgrams
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import os
import h5py
import numpy as np
import torch

class CustomSkipGramDataset(Dataset):
    def __init__(self, filename, window_size, data_dir="training_data", data_exists=True):
        self.window_size = window_size
        self.filename = filename
        self.data_exists = data_exists
        self.vocab_path = os.path.join(data_dir, "vocab.npy")
        self.hdf5_path = os.path.join(data_dir, "skipgram.h5")
        
        if not data_exists:
            get_vocab_and_skipgrams(filename, data_dir)
        
        self.vocab = np.load(self.vocab_path, allow_pickle=True).tolist()
        self.vocab_size = len(self.vocab)
        self.hf = h5py.File(self.hdf5_path, "r")  # handle stays open for the Dataset's lifetime
        self.dataset = self.hf["positive_skips"]
        
    def __len__(self):
        return self.dataset.shape[0]
    
    def __getitem__(self, index):
        # one single-row HDF5 read per sample: this is the hot path
        x, y = self.dataset[index]
        return x, y
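
For reference, it's worth timing one-row reads against a single slice of the same size; a rough micro-benchmark sketch (the path and dataset name come from the code above, the row count of 10,000 is arbitrary):

Code:
import time
import h5py

with h5py.File("./training_data/skipgram.h5", "r") as hf:
    ds = hf["positive_skips"]

    t0 = time.perf_counter()
    rows = [ds[i] for i in range(10_000)]  # 10,000 separate HDF5 reads
    t1 = time.perf_counter()

    block = ds[:10_000]                    # one HDF5 read for the same rows
    t2 = time.perf_counter()

print(f"per-row: {t1 - t0:.3f}s  sliced: {t2 - t1:.3f}s")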

Now, when I load the data directly like this:

Code:
with h5py.File("./training_data/skipgram.h5", "r") as hf:
    dataset = hf["positive_skips"]
    for a in range(1,100):
        print(torch.tensor(dataset[a:100*a]))

it is indeed very fast compared to going through the torch custom Dataset, almost 100x faster. I know I'm doing something wrong.
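
My understanding so far is that the gap comes from access granularity: the DataLoader calls __getitem__ once per row, so every sample pays for its own h5py read plus per-sample collation, while the loop above amortizes one read over hundreds of rows. Below is a minimal sketch of a chunked Dataset that reads one contiguous slice per __getitem__; the class name and batch size are made up, and it assumes the same (N, 2) positive_skips layout as above:

Code:
import h5py
import torch
from torch.utils.data import Dataset, DataLoader

class ChunkedSkipGramDataset(Dataset):
    """Reads one contiguous chunk of rows per __getitem__ instead of one row."""

    def __init__(self, hdf5_path, batch_size=512):
        self.hdf5_path = hdf5_path
        self.batch_size = batch_size
        with h5py.File(hdf5_path, "r") as hf:
            self.n_rows = hf["positive_skips"].shape[0]
        self.hf = None  # opened lazily so each DataLoader worker gets its own handle

    def __len__(self):
        # number of chunks, not rows
        return (self.n_rows + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        if self.hf is None:
            self.hf = h5py.File(self.hdf5_path, "r")
        start = index * self.batch_size
        stop = min(start + self.batch_size, self.n_rows)
        pairs = torch.from_numpy(self.hf["positive_skips"][start:stop])  # one read per chunk
        return pairs[:, 0], pairs[:, 1]

# batch_size=None disables automatic batching, so each chunk passes through unchanged
loader = DataLoader(ChunkedSkipGramDataset("./training_data/skipgram.h5"),
                    batch_size=None, num_workers=2)

The lazy open also matters once num_workers > 0: an h5py handle created in __init__ is inherited by forked worker processes and is not safe to share.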