import gzip
import json
import os

from sentence_transformers import util


def download_simplewiki(dataset_path):
    """Download the SimpleWiki dump if it is not already present locally.

    Note: dataset_path doubles as the remote filename, so it should be a
    bare filename rather than a nested path.
    """
    if not os.path.exists(dataset_path):
        # Original location, kept for reference:
        # util.http_get('https://sbert.net/datasets/{}'.format(dataset_path), dataset_path)
        util.http_get('https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/{}'.format(dataset_path), dataset_path)


def get_all_passages(dataset_path):
    """Return every paragraph as a passage dict, keyed back to its article."""
    all_passages = []
    with gzip.open(dataset_path, 'rt', encoding='utf8') as fIn:
        for line in fIn:
            data = json.loads(line.strip())
            for passage in data['paragraphs']:
                all_passages.append({
                    'passage': passage,
                    'article_id': data['id'],
                })
    return all_passages


def get_all_articles(dataset_path):
    """Return a dict mapping article id to the full article, with its paragraphs re-joined."""
    all_articles = {}
    with gzip.open(dataset_path, 'rt', encoding='utf8') as fIn:
        for line in fIn:
            data = json.loads(line.strip())
            all_articles[data['id']] = {
                'id': data['id'],
                'title': data['title'],
                'content': '\n'.join(data['paragraphs']),
            }
    return all_articles


def get_dataset(dataset_path):
    """Download the dump, parse it into passages and articles, then delete the local file."""
    download_simplewiki(dataset_path)
    passages = get_all_passages(dataset_path)
    articles = get_all_articles(dataset_path)
    # Sanity checks tied to the specific dump this path points at;
    # a different dump version will have different counts.
    assert len(passages) == 509663
    assert len(articles) == 169597
    os.remove(dataset_path)
    return {
        'passages': passages,
        'articles': articles,
    }
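
# Example usage: a minimal sketch. The filename 'simplewiki-2020-11-01.jsonl.gz'
# is an assumption based on the SimpleWiki dump used in sentence-transformers
# examples; substitute whichever dump filename the hosting location provides.
if __name__ == '__main__':
    dataset = get_dataset('simplewiki-2020-11-01.jsonl.gz')
    print('Passages:', len(dataset['passages']))   # expected: 509663
    print('Articles:', len(dataset['articles']))   # expected: 169597
    # Each passage carries its source article id, so it can be traced
    # back to the full article via dataset['articles'].
    first = dataset['passages'][0]
    print('First passage comes from:', dataset['articles'][first['article_id']]['title'])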