# -*- coding: utf-8 -*-

"""
Tests for MosesTruecaser and MosesDetruecaser
"""

import os
import unittest
import urllib.request

from sacremoses.truecase import MosesTruecaser, MosesDetruecaser


def get_content(url):
    """Open *url* and return the raw response body.

    The body is returned as ``bytes`` (the result of ``read()`` on the
    opened response), not as a response object; callers decode it
    themselves. The response is closed before returning.
    """
    response = urllib.request.urlopen(url)
    with response:
        payload = response.read()
    return payload


class TestTruecaser(unittest.TestCase):
    """Tests for ``MosesTruecaser`` trained on Norvig's ``big.txt`` corpus
    and on small hand-written corpora."""

    def setUp(self):
        """Load (downloading and caching if necessary) ``big.txt`` and build
        the input -> expected-output table shared by the corpus tests."""
        # Check if the Norvig's big.txt file exists.
        if os.path.isfile("big.txt"):
            # The cached copy is written as UTF-8 below, so read it back
            # the same way instead of relying on the locale's default.
            with open("big.txt", encoding="utf-8") as fin:
                self.big_txt = fin.read()
        else:  # Otherwise, download the big.txt.
            try:  # Download from the original norvig.com
                self.big_txt = get_content("https://norvig.com/big.txt").decode("utf8")
            except Exception:
                # Any ordinary failure falls back to the github gist mirror;
                # unlike a bare ``except:``, KeyboardInterrupt and SystemExit
                # still propagate.
                big_text_url = (
                    "https://gist.githubusercontent.com/alvations/"
                    "6e878bab0eda2624167aa7ec13fc3e94/raw/"
                    "4fb3bac1da1ba7a172ff1936e96bee3bc8892931/"
                    "big.txt"
                )
                self.big_txt = get_content(big_text_url).decode("utf8")
            # Cache the corpus locally; test_moses_truecase_file trains
            # directly from this file.
            with open("big.txt", "w", encoding="utf-8") as fout:
                fout.write(self.big_txt)

        # Test case where inputs are all caps.
        caps_input = "THE ADVENTURES OF SHERLOCK HOLMES"
        expected_caps_output = ["the", "adventures", "of", "Sherlock", "Holmes"]

        # Test normal input to truecase.
        normal_input = (
            "You can also find out about how to make a donation "
            "to Project Gutenberg, and how to get involved."
        )
        expected_normal_output = [
            "you",
            "can",
            "also",
            "find",
            "out",
            "about",
            "how",
            "to",
            "make",
            "a",
            "donation",
            "to",
            "Project",
            "Gutenberg,",
            "and",
            "how",
            "to",
            "get",
            "involved.",
        ]

        # Keep a key-value pairs of in/outputs.
        self.input_output = {
            caps_input: expected_caps_output,
            normal_input: expected_normal_output,
        }

    def _assert_truecases_expected(self, moses):
        """Check every ``self.input_output`` pair against a trained *moses*."""
        for _input, _output in self.input_output.items():
            # subTest reports each failing sentence separately instead of
            # stopping at the first mismatch.
            with self.subTest(text=_input):
                self.assertEqual(moses.truecase(_input), _output)

    def test_moses_truecase_documents(self):
        """Train from in-memory tokenized documents, then check the table."""
        moses = MosesTruecaser()
        # Train the model from documents.
        docs = [line.split() for line in self.big_txt.split("\n")]
        moses.train(docs)
        # Test all self.input_output test cases.
        self._assert_truecases_expected(moses)

    def test_moses_truecase_file(self):
        """Train from the cached ``big.txt`` file, then check the table."""
        moses = MosesTruecaser()
        # Train the model from file.
        moses.train_from_file("big.txt")
        # Test all self.input_output test cases.
        self._assert_truecases_expected(moses)

    def test_use_known(self):
        """Test that ``use_known=True`` keeps a casing seen in training even
        when it is not the most frequent one."""
        moses = MosesTruecaser()
        moses.train(
            [
                # 'a' is best, but 'A' is also known.
                ["Start", "a", "a", "A", "A", "a", "a", "."],
            ]
        )
        self.assertEqual(
            moses.truecase("Start A .", use_known=False), ["Start", "a", "."]
        )
        self.assertEqual(
            moses.truecase("Start A .", use_known=True), ["Start", "A", "."]
        )

    def test_delayed_sentence_start(self):
        """Test that first word after delayed sentence start still holds, and
        thus ignores use_known for that particular token."""
        moses = MosesTruecaser()
        moses.train(
            [
                # 'Start' is best, but 'start' is okay.
                ["Start", "Start", "start"],
            ]
        )
        self.assertEqual(
            moses.truecase('" start start', use_known=True), ['"', "Start", "start"]
        )


class TestDetruecaser(unittest.TestCase):
    """Tests for ``MosesDetruecaser.detruecase`` in list and string modes.

    Assertions use ``self.assertEqual`` rather than bare ``assert`` so they
    are not stripped under ``python -O`` and report a diff on failure.
    """

    def test_moses_detruecase_str(self):
        """A plain sentence is expected back with its first token capitalized."""
        moses = MosesDetruecaser()
        text = "the adventures of Sherlock Holmes"
        expected = ["The", "adventures", "of", "Sherlock", "Holmes"]
        expected_str = "The adventures of Sherlock Holmes"
        self.assertEqual(moses.detruecase(text), expected)
        self.assertEqual(moses.detruecase(text, return_str=True), expected_str)

    def test_moses_detruecase_headline(self):
        """With ``is_headline=True`` the expected output also capitalizes
        "Adventures" while leaving "of" in lower case."""
        moses = MosesDetruecaser()
        text = "the adventures of Sherlock Holmes"
        expected = ["The", "Adventures", "of", "Sherlock", "Holmes"]
        expected_str = "The Adventures of Sherlock Holmes"
        self.assertEqual(moses.detruecase(text, is_headline=True), expected)
        self.assertEqual(
            moses.detruecase(text, is_headline=True, return_str=True), expected_str
        )

    def test_moses_detruecase_allcaps(self):
        """A sentence starting with an all-caps token is expected unchanged."""
        moses = MosesDetruecaser()
        text = "MLB Baseball standings"
        expected = ["MLB", "Baseball", "standings"]
        expected_str = "MLB Baseball standings"
        self.assertEqual(moses.detruecase(text), expected)
        self.assertEqual(moses.detruecase(text, return_str=True), expected_str)
