commit 0c6baed39888bd1e0b341b482ff2a4167cbab202 Author: svxf Date: Fri Mar 15 13:54:11 2019 +0400 git reimport diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..f5a784d --- /dev/null +++ b/Pipfile @@ -0,0 +1,14 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +numpy = "*" +keras = "*" +keras-rl = "*" + +[dev-packages] + +[requires] +python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..1113bf6 --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,175 @@ +{ + "_meta": { + "hash": { + "sha256": "a227a69980d84f05f5a653bcc6f1cc461e95f114074a0baae20d457cc84e7ebc" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.7" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "h5py": { + "hashes": [ + "sha256:0f8cd2acbacf3177b4427ed42639c911667b1f24d923388ab1f8ad466a12be5e", + "sha256:11277e3879098f921ee9e29105b20591e1dfdd44963357399f2abaa1a280c560", + "sha256:1241dec0c94ac32f3285cac1d6f44beabf80423e422ab03bd2686d731a8a9294", + "sha256:17b8187de0b3a945d8e8d031e7eb6ece2fce90791f9c5fde36f4396bf38fdde1", + "sha256:2f30007d0796788a454c1293262f19f25e6428317d3d386f78138fba2a44e37d", + "sha256:308e0758587ee16d4e73e7f2f8aae8351091e343bf0a43d2f697f9535465c816", + "sha256:37cacddf0e8209905f52537a8cf71da0dd9a4de62bd79247274c97b24a408997", + "sha256:38a23bb599748adf23d77f74885c0de6f4a7d9baa42f74e476bbf90fba2b47dd", + "sha256:47ab18b7b7bbc36fd2b606289b703b6f0ee915b923d6ad94dd17ac80ebffc280", + "sha256:486c78330af0bf33f5077b51d1888c0739c3cd1a03d5aade0d48572b3b5690ca", + "sha256:4e2183458d6ef1ae87dfb5d6acd0786359336cd9ac0ece6396c09b59fdaa3bd6", + "sha256:51d0595c3e58814c831f6cd2b664a5bf9590e26262c1d541b380d041e4fcb3c0", + "sha256:56d259d56822b70881760b243957f04a0cf133f0ec65eae6a33f562826aee899", + "sha256:5e6e777653169a3cc24ea56bb3d8c845ea391f8914c35bb6f350b0753a52891c", + "sha256:62bfb0ebb0f59e5dccc0b0dbbc0fc40dd1d1e09d04c0dc71f89790231531d4a2", + "sha256:67d89b64debfa021b54aa6f24bbf008403bd144748a0148596b518bce80d2fc4", + "sha256:6bf38571f555fa214493ec6349d29024cc5f313bf1715b09f236c553fd22ae4d", + "sha256:9214ca445c18a37bfe9c165982c0e317e2f21f035c8d635d1c6d9fcbaf35b7a8", + "sha256:ab0c52850428d2e86029935389379c2c97f752e76b616da851deec8a4484f8ec", + "sha256:b2eff336697d8dfd712c5d93fef9f4e4d3e97d9d8c258801836b8664a239e07a", + "sha256:bb33fabc0b8f3fe3bb0f8d6821b2fad5b2a64c27a0808e8d1c5c1e3362062064", + "sha256:bd5353ab342bae1262b04745934cc1565df4cbc8d6a979a0c98f42209bd5c265", + "sha256:bd73444efd1ac06dac27b8405bbe8791a02fd1bc8a2fa0e575257f90b7b57467", + "sha256:bd932236a2ef91a75fee5d7f4ace80ab494c5a59cd092a67c9785ddb7fdc218c", + "sha256:c45650de228ace7731e4280e14fb687f6d5c29cd666c5b22b42492b035e994d6", + "sha256:d5c0c01da45f901a3d429e7ef9e7e22baa869e1affb8715f1bf94e6a30020740", + "sha256:d75035db5bde802a29f4f29f18bb7548863d29ac90ccbf2c04c11799bbbba2c3", + "sha256:dda88206dc9464923f27f601000bc5b152ac0bd6d0122f098d4f239150a70076", + "sha256:e1c2ac5d0aa232c0f60fecc6bd1122346885086a176f939b91058c4c980cc226", + "sha256:e626c65a8587921ebc7fb8d31a49addfdd0b9a9aa96315ea484c09803337b955" + ], + "version": "==2.8.0" + }, + "keras": { + "hashes": [ + "sha256:794d0c92c6c4122f1f0fcf3a7bc2f49054c6a54ddbef8d8ffafca62795d760b6", + "sha256:90b610a3dbbf6d257b20a079eba3fdf2eed2158f64066a7c6f7227023fd60bc9" + ], + "index": "pypi", + "version": "==2.2.4" + }, + "keras-applications": { + "hashes": [ + 
"sha256:721dda4fa4e043e5bbd6f52a2996885c4639a7130ae478059b3798d0706f5ae7", + "sha256:a03af60ddc9c5afdae4d5c9a8dd4ca857550e0b793733a5072e0725829b87017" + ], + "version": "==1.0.6" + }, + "keras-preprocessing": { + "hashes": [ + "sha256:90d04c1750bccceef88ac09475c291b4b5f6aa1eaf0603167061b1aa8b043c61", + "sha256:ef2e482c4336fcf7180244d06f4374939099daa3183816e82aee7755af35b754" + ], + "version": "==1.0.5" + }, + "keras-rl": { + "hashes": [ + "sha256:7bbbb24c8f4560a03f59fb062a5003da102de033bc8cd7d06b69b4c1b48ec054" + ], + "index": "pypi", + "version": "==0.4.2" + }, + "numpy": { + "hashes": [ + "sha256:032df9b6571c5f1d41ea6f6a189223208cb488990373aa686aca55570fcccb42", + "sha256:094f8a83e5bd0a44a7557fa24a46db6ba7d5299c389ddbc9e0e18722f567fb63", + "sha256:1c0c80e74759fa4942298044274f2c11b08c86230b25b8b819e55e644f5ff2b6", + "sha256:2aa0910eaeb603b1a5598193cc3bc8eacf1baf6c95cbc3955eb8e15fa380c133", + "sha256:2f5ebc7a04885c7d69e5daa05208faef4db7f1ae6a99f4d36962df8cd54cdc76", + "sha256:32a07241cb624e104b88b08dea2851bf4ec5d65a1f599d7735041ced7171fd7a", + "sha256:3c7959f750b54b445f14962a3ddc41b9eadbab00b86da55fbb1967b2b79aad10", + "sha256:3d8f9273c763a139a99e65c2a3c10f1109df30bedae7f011b10d95c538364704", + "sha256:63bca71691339d2d6f8a7c970821f2b12098a53afccc0190d4e1555e75e5223a", + "sha256:7ae9c3baff3b989859c88e0168ad10902118595b996bf781eaf011bb72428798", + "sha256:866a7c8774ccc7d603667fad95456b4cf56d79a2bb5a7648ac9f0082e0b9416e", + "sha256:8bc4b92a273659e44ca3f3a2f8786cfa39d8302223bcfe7df794429c63d5f5a1", + "sha256:919f65e0732195474897b1cafefb4d4e7c2bb8174a725e506b62e9096e4df28d", + "sha256:9d1598573d310104acb90377f0a8c2319f737084689f5eb18012becaf345cda5", + "sha256:9fff90c88bfaad2901be50453d5cd7897a826c1d901f0654ee1d73ab3a48cd18", + "sha256:a245464ddf6d90e2d6287e9cef6bcfda2a99467fdcf1b677b99cd0b6c7b43de2", + "sha256:a988db28f54e104a01e8573ceb6f28202b4c15635b1450b2e3b2b822c6564f9b", + "sha256:b12fe6f31babb9477aa0f9692730654b3ee0e71f33b4568170dfafd439caf0a2", + "sha256:b7599ff4acd23f5de983e3aec772153b1043e131487a5c6ad0f94b41a828877a", + "sha256:c9f4dafd6065c4c782be84cd67ceeb9b1d4380af60a7af32be10ebecd723385e", + "sha256:ce3622b73ccd844ba301c1aea65d36cf9d8331e7c25c16b1725d0f14db99aaf4", + "sha256:d0f36a24cf8061a2c03e151be3418146717505b9b4ec17502fa3bbdb04ec1431", + "sha256:d263f8f14f2da0c079c0297e829e550d8f2c4e0ffef215506bd1d0ddd2bff3de", + "sha256:d8837ff272800668aabdfe70b966631914b0d6513aed4fc1b1428446f771834d", + "sha256:ef694fe72a3995aa778a5095bda946e0d31f7efabd5e8063ad8c6238ab7d3f78", + "sha256:f1fd1a6f40a501ba4035f5ed2c1f4faa68245d1407bf97d2ee401e4f23d1720b", + "sha256:fa337b6bd5fe2b8c4e705f4102186feb9985de9bb8536d32d5129a658f1789e0", + "sha256:febd31cd0d2fd2509ca2ec53cb339f8bf593c1bd245b9fc55c1917a68532a0af" + ], + "index": "pypi", + "version": "==1.15.3" + }, + "pyyaml": { + "hashes": [ + "sha256:3d7da3009c0f3e783b2c873687652d83b1bbfd5c88e9813fb7e5b03c0dd3108b", + "sha256:3ef3092145e9b70e3ddd2c7ad59bdd0252a94dfe3949721633e41344de00a6bf", + "sha256:40c71b8e076d0550b2e6380bada1f1cd1017b882f7e16f09a65be98e017f211a", + "sha256:558dd60b890ba8fd982e05941927a3911dc409a63dcb8b634feaa0cda69330d3", + "sha256:a7c28b45d9f99102fa092bb213aa12e0aaf9a6a1f5e395d36166639c1f96c3a1", + "sha256:aa7dd4a6a427aed7df6fb7f08a580d68d9b118d90310374716ae90b710280af1", + "sha256:bc558586e6045763782014934bfaf39d48b8ae85a2713117d16c39864085c613", + "sha256:d46d7982b62e0729ad0175a9bc7e10a566fc07b224d2c79fafb5e032727eaa04", + "sha256:d5eef459e30b09f5a098b9cea68bebfeb268697f78d647bd255a085371ac7f3f", + 
"sha256:e01d3203230e1786cd91ccfdc8f8454c8069c91bee3962ad93b87a4b2860f537", + "sha256:e170a9e6fcfd19021dd29845af83bb79236068bf5fd4df3327c1be18182b2531" + ], + "version": "==3.13" + }, + "scipy": { + "hashes": [ + "sha256:0611ee97296265af4a21164a5323f8c1b4e8e15c582d3dfa7610825900136bb7", + "sha256:08237eda23fd8e4e54838258b124f1cd141379a5f281b0a234ca99b38918c07a", + "sha256:0e645dbfc03f279e1946cf07c9c754c2a1859cb4a41c5f70b25f6b3a586b6dbd", + "sha256:0e9bb7efe5f051ea7212555b290e784b82f21ffd0f655405ac4f87e288b730b3", + "sha256:108c16640849e5827e7d51023efb3bd79244098c3f21e4897a1007720cb7ce37", + "sha256:340ef70f5b0f4e2b4b43c8c8061165911bc6b2ad16f8de85d9774545e2c47463", + "sha256:3ad73dfc6f82e494195144bd3a129c7241e761179b7cb5c07b9a0ede99c686f3", + "sha256:3b243c77a822cd034dad53058d7c2abf80062aa6f4a32e9799c95d6391558631", + "sha256:404a00314e85eca9d46b80929571b938e97a143b4f2ddc2b2b3c91a4c4ead9c5", + "sha256:423b3ff76957d29d1cce1bc0d62ebaf9a3fdfaf62344e3fdec14619bb7b5ad3a", + "sha256:42d9149a2fff7affdd352d157fa5717033767857c11bd55aa4a519a44343dfef", + "sha256:625f25a6b7d795e8830cb70439453c9f163e6870e710ec99eba5722775b318f3", + "sha256:698c6409da58686f2df3d6f815491fd5b4c2de6817a45379517c92366eea208f", + "sha256:729f8f8363d32cebcb946de278324ab43d28096f36593be6281ca1ee86ce6559", + "sha256:8190770146a4c8ed5d330d5b5ad1c76251c63349d25c96b3094875b930c44692", + "sha256:878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1", + "sha256:8b984f0821577d889f3c7ca8445564175fb4ac7c7f9659b7c60bef95b2b70e76", + "sha256:8f841bbc21d3dad2111a94c490fb0a591b8612ffea86b8e5571746ae76a3deac", + "sha256:c22b27371b3866c92796e5d7907e914f0e58a36d3222c5d436ddd3f0e354227a", + "sha256:d0cdd5658b49a722783b8b4f61a6f1f9c75042d0e29a30ccb6cacc9b25f6d9e2", + "sha256:d40dc7f494b06dcee0d303e51a00451b2da6119acbeaccf8369f2d29e28917ac", + "sha256:d8491d4784aceb1f100ddb8e31239c54e4afab8d607928a9f7ef2469ec35ae01", + "sha256:dfc5080c38dde3f43d8fbb9c0539a7839683475226cf83e4b24363b227dfe552", + "sha256:e24e22c8d98d3c704bb3410bce9b69e122a8de487ad3dbfe9985d154e5c03a40", + "sha256:e7a01e53163818d56eabddcafdc2090e9daba178aad05516b20c6591c4811020", + "sha256:ee677635393414930541a096fc8e61634304bb0153e4e02b75685b11eba14cae", + "sha256:f0521af1b722265d824d6ad055acfe9bd3341765735c44b5a4d0069e189a0f40", + "sha256:f25c281f12c0da726c6ed00535ca5d1622ec755c30a3f8eafef26cf43fede694" + ], + "version": "==1.1.0" + }, + "six": { + "hashes": [ + "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", + "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb" + ], + "version": "==1.11.0" + } + }, + "develop": {} +} diff --git a/libs/gym b/libs/gym new file mode 160000 index 0000000..e944885 --- /dev/null +++ b/libs/gym @@ -0,0 +1 @@ +Subproject commit e944885e3b31a10fb6973093b39ff7682ef3aa3d diff --git a/pilesos2.log b/pilesos2.log new file mode 100644 index 0000000..b8d56f9 --- /dev/null +++ b/pilesos2.log @@ -0,0 +1,169 @@ +Action.FORWARD +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.LEFT +Action.LEFT +Action.FORWARD +Action.SUCK +Action.LEFT +Action.FORWARD +Action.LEFT +Action.LEFT +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.FORWARD 
+Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.SUCK +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.LEFT +Action.LEFT +Action.FORWARD +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.LEFT +Action.LEFT +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.LEFT +Action.LEFT +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.LEFT +Action.LEFT +Action.FORWARD +Action.FORWARD +Action.SUCK +Action.RIGHT +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.LEFT +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.FORWARD +Action.FORWARD +Action.RIGHT +Action.FORWARD +Action.FORWARD +Action.DIE diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/gym/__init__.py b/src/gym/__init__.py new file mode 100644 index 0000000..ec6623a --- /dev/null +++ b/src/gym/__init__.py @@ -0,0 +1,14 @@ +import distutils.version +import os +import sys +import warnings + +from gym import error +from gym.utils import reraise +from gym.version import VERSION as __version__ + +from gym.core import Env, GoalEnv, Space, Wrapper, ObservationWrapper, ActionWrapper, RewardWrapper +from gym.envs import make, spec +from gym import logger + +__all__ = ["Env", "Space", "Wrapper", "make", "spec"] diff --git a/src/gym/__pycache__/__init__.cpython-37.pyc b/src/gym/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..cecc724 Binary files /dev/null and b/src/gym/__pycache__/__init__.cpython-37.pyc differ diff --git a/src/gym/__pycache__/core.cpython-37.pyc b/src/gym/__pycache__/core.cpython-37.pyc new file mode 100644 index 0000000..420e8cd Binary files /dev/null and b/src/gym/__pycache__/core.cpython-37.pyc differ diff --git a/src/gym/__pycache__/error.cpython-37.pyc b/src/gym/__pycache__/error.cpython-37.pyc new file mode 100644 index 0000000..bbe5403 Binary files /dev/null and b/src/gym/__pycache__/error.cpython-37.pyc differ diff --git a/src/gym/__pycache__/logger.cpython-37.pyc b/src/gym/__pycache__/logger.cpython-37.pyc new file mode 100644 index 0000000..0ca2a22 Binary files /dev/null and b/src/gym/__pycache__/logger.cpython-37.pyc differ diff --git a/src/gym/__pycache__/version.cpython-37.pyc 
b/src/gym/__pycache__/version.cpython-37.pyc new file mode 100644 index 0000000..19f7720 Binary files /dev/null and b/src/gym/__pycache__/version.cpython-37.pyc differ diff --git a/src/gym/core.py b/src/gym/core.py new file mode 100644 index 0000000..7ea2143 --- /dev/null +++ b/src/gym/core.py @@ -0,0 +1,343 @@ +from gym import logger + +import gym +from gym import error +from gym.utils import closer + +env_closer = closer.Closer() + +# Env-related abstractions + +class Env(object): + """The main OpenAI Gym class. It encapsulates an environment with + arbitrary behind-the-scenes dynamics. An environment can be + partially or fully observed. + + The main API methods that users of this class need to know are: + + step + reset + render + close + seed + + And set the following attributes: + + action_space: The Space object corresponding to valid actions + observation_space: The Space object corresponding to valid observations + reward_range: A tuple corresponding to the min and max possible rewards + + Note: a default reward range set to [-inf,+inf] already exists. Set it if you want a narrower range. + + The methods are accessed publicly as "step", "reset", etc.. The + non-underscored versions are wrapper methods to which we may add + functionality over time. + """ + + # Set this in SOME subclasses + metadata = {'render.modes': []} + reward_range = (-float('inf'), float('inf')) + spec = None + + # Set these in ALL subclasses + action_space = None + observation_space = None + + def step(self, action): + """Run one timestep of the environment's dynamics. When end of + episode is reached, you are responsible for calling `reset()` + to reset this environment's state. + + Accepts an action and returns a tuple (observation, reward, done, info). + + Args: + action (object): an action provided by the environment + + Returns: + observation (object): agent's observation of the current environment + reward (float) : amount of reward returned after previous action + done (boolean): whether the episode has ended, in which case further step() calls will return undefined results + info (dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning) + """ + raise NotImplementedError + + def reset(self): + """Resets the state of the environment and returns an initial observation. + + Returns: observation (object): the initial observation of the + space. + """ + raise NotImplementedError + + def render(self, mode='human'): + """Renders the environment. + + The set of supported modes varies per environment. (And some + environments do not support rendering at all.) By convention, + if mode is: + + - human: render to the current display or terminal and + return nothing. Usually for human consumption. + - rgb_array: Return an numpy.ndarray with shape (x, y, 3), + representing RGB values for an x-by-y pixel image, suitable + for turning into a video. + - ansi: Return a string (str) or StringIO.StringIO containing a + terminal-style text representation. The text can include newlines + and ANSI escape sequences (e.g. for colors). + + Note: + Make sure that your class's metadata 'render.modes' key includes + the list of supported modes. It's recommended to call super() + in implementations to use the functionality of this method. 
+ + Args: + mode (str): the mode to render with + close (bool): close all open renderings + + Example: + + class MyEnv(Env): + metadata = {'render.modes': ['human', 'rgb_array']} + + def render(self, mode='human'): + if mode == 'rgb_array': + return np.array(...) # return RGB frame suitable for video + elif mode is 'human': + ... # pop up a window and render + else: + super(MyEnv, self).render(mode=mode) # just raise an exception + """ + raise NotImplementedError + + def close(self): + """Override _close in your subclass to perform any necessary cleanup. + + Environments will automatically close() themselves when + garbage collected or when the program exits. + """ + return + + def seed(self, seed=None): + """Sets the seed for this env's random number generator(s). + + Note: + Some environments use multiple pseudorandom number generators. + We want to capture all such seeds used in order to ensure that + there aren't accidental correlations between multiple generators. + + Returns: + list: Returns the list of seeds used in this env's random + number generators. The first value in the list should be the + "main" seed, or the value which a reproducer should pass to + 'seed'. Often, the main seed equals the provided 'seed', but + this won't be true if seed=None, for example. + """ + logger.warn("Could not seed environment %s", self) + return + + @property + def unwrapped(self): + """Completely unwrap this env. + + Returns: + gym.Env: The base non-wrapped gym.Env instance + """ + return self + + def __str__(self): + if self.spec is None: + return '<{} instance>'.format(type(self).__name__) + else: + return '<{}<{}>>'.format(type(self).__name__, self.spec.id) + + +class GoalEnv(Env): + """A goal-based environment. It functions just as any regular OpenAI Gym environment but it + imposes a required structure on the observation_space. More concretely, the observation + space is required to contain at least three elements, namely `observation`, `desired_goal`, and + `achieved_goal`. Here, `desired_goal` specifies the goal that the agent should attempt to achieve. + `achieved_goal` is the goal that it currently achieved instead. `observation` contains the + actual observations of the environment as per usual. + """ + + def reset(self): + # Enforce that each GoalEnv uses a Goal-compatible observation space. + if not isinstance(self.observation_space, gym.spaces.Dict): + raise error.Error('GoalEnv requires an observation space of type gym.spaces.Dict') + result = super(GoalEnv, self).reset() + for key in ['observation', 'achieved_goal', 'desired_goal']: + if key not in result: + raise error.Error('GoalEnv requires the "{}" key to be part of the observation dictionary.'.format(key)) + return result + + def compute_reward(self, achieved_goal, desired_goal, info): + """Compute the step reward. This externalizes the reward function and makes + it dependent on an a desired goal and the one that was achieved. If you wish to include + additional rewards that are independent of the goal, you can include the necessary values + to derive it in info and compute it accordingly. + + Args: + achieved_goal (object): the goal that was achieved during execution + desired_goal (object): the desired goal that we asked the agent to attempt to achieve + info (dict): an info dictionary with additional information + + Returns: + float: The reward that corresponds to the provided achieved goal w.r.t. to the desired + goal. 
Note that the following should always hold true: + + ob, reward, done, info = env.step() + assert reward == env.compute_reward(ob['achieved_goal'], ob['goal'], info) + """ + raise NotImplementedError() + +# Space-related abstractions + +class Space(object): + """Defines the observation and action spaces, so you can write generic + code that applies to any Env. For example, you can choose a random + action. + """ + def __init__(self, shape=None, dtype=None): + import numpy as np # takes about 300-400ms to import, so we load lazily + self.shape = None if shape is None else tuple(shape) + self.dtype = None if dtype is None else np.dtype(dtype) + + def sample(self): + """ + Uniformly randomly sample a random element of this space + """ + raise NotImplementedError + + def contains(self, x): + """ + Return boolean specifying if x is a valid + member of this space + """ + raise NotImplementedError + + __contains__ = contains + + def to_jsonable(self, sample_n): + """Convert a batch of samples from this space to a JSONable data type.""" + # By default, assume identity is JSONable + return sample_n + + def from_jsonable(self, sample_n): + """Convert a JSONable data type to a batch of samples from this space.""" + # By default, assume identity is JSONable + return sample_n + + +warn_once = True + +def deprecated_warn_once(text): + global warn_once + if not warn_once: return + warn_once = False + logger.warn(text) + + +class Wrapper(Env): + env = None + + def __init__(self, env): + self.env = env + self.action_space = self.env.action_space + self.observation_space = self.env.observation_space + self.reward_range = self.env.reward_range + self.metadata = self.env.metadata + + @classmethod + def class_name(cls): + return cls.__name__ + + def step(self, action): + if hasattr(self, "_step"): + deprecated_warn_once("%s doesn't implement 'step' method, but it implements deprecated '_step' method." % type(self)) + self.step = self._step + return self.step(action) + else: + deprecated_warn_once("%s doesn't implement 'step' method, " % type(self) + + "which is required for wrappers derived directly from Wrapper. Deprecated default implementation is used.") + return self.env.step(action) + + def reset(self, **kwargs): + if hasattr(self, "_reset"): + deprecated_warn_once("%s doesn't implement 'reset' method, but it implements deprecated '_reset' method." % type(self)) + self.reset = self._reset + return self._reset(**kwargs) + else: + deprecated_warn_once("%s doesn't implement 'reset' method, " % type(self) + + "which is required for wrappers derived directly from Wrapper. 
Deprecated default implementation is used.") + return self.env.reset(**kwargs) + + def render(self, mode='human', **kwargs): + return self.env.render(mode, **kwargs) + + def close(self): + if self.env: + return self.env.close() + + def seed(self, seed=None): + return self.env.seed(seed) + + def compute_reward(self, achieved_goal, desired_goal, info): + return self.env.compute_reward(achieved_goal, desired_goal, info) + + def __str__(self): + return '<{}{}>'.format(type(self).__name__, self.env) + + def __repr__(self): + return str(self) + + @property + def unwrapped(self): + return self.env.unwrapped + + @property + def spec(self): + return self.env.spec + + +class ObservationWrapper(Wrapper): + def step(self, action): + observation, reward, done, info = self.env.step(action) + return self.observation(observation), reward, done, info + + def reset(self, **kwargs): + observation = self.env.reset(**kwargs) + return self.observation(observation) + + def observation(self, observation): + deprecated_warn_once("%s doesn't implement 'observation' method. Maybe it implements deprecated '_observation' method." % type(self)) + return self._observation(observation) + + +class RewardWrapper(Wrapper): + def reset(self): + return self.env.reset() + + def step(self, action): + observation, reward, done, info = self.env.step(action) + return observation, self.reward(reward), done, info + + def reward(self, reward): + deprecated_warn_once("%s doesn't implement 'reward' method. Maybe it implements deprecated '_reward' method." % type(self)) + return self._reward(reward) + + +class ActionWrapper(Wrapper): + def step(self, action): + action = self.action(action) + return self.env.step(action) + + def reset(self): + return self.env.reset() + + def action(self, action): + deprecated_warn_once("%s doesn't implement 'action' method. Maybe it implements deprecated '_action' method." % type(self)) + return self._action(action) + + def reverse_action(self, action): + deprecated_warn_once("%s doesn't implement 'reverse_action' method. Maybe it implements deprecated '_reverse_action' method." % type(self)) + return self._reverse_action(action) diff --git a/src/gym/envs/README.md b/src/gym/envs/README.md new file mode 100644 index 0000000..5ae5774 --- /dev/null +++ b/src/gym/envs/README.md @@ -0,0 +1,113 @@ +# Envs + +These are the core integrated environments. Note that we may later +restructure any of the files, but will keep the environments available +at the relevant package's top-level. So for example, you should access +`AntEnv` as follows: + +``` +# Will be supported in future releases +from gym.envs import mujoco +mujoco.AntEnv +``` + +Rather than: + +``` +# May break in future releases +from gym.envs.mujoco import ant +ant.AntEnv +``` + +## How to create new environments for Gym + +* Create a new repo called gym-foo, which should also be a PIP package. + +* A good example is https://github.com/openai/gym-soccer. 
+ +* It should have at least the following files: + ```sh + gym-foo/ + README.md + setup.py + gym_foo/ + __init__.py + envs/ + __init__.py + foo_env.py + foo_extrahard_env.py + ``` + +* `gym-foo/setup.py` should have: + + ```python + from setuptools import setup + + setup(name='gym_foo', + version='0.0.1', + install_requires=['gym'] # And any other dependencies foo needs + ) + ``` + +* `gym-foo/gym_foo/__init__.py` should have: + ```python + from gym.envs.registration import register + + register( + id='foo-v0', + entry_point='gym_foo.envs:FooEnv', + ) + register( + id='foo-extrahard-v0', + entry_point='gym_foo.envs:FooExtraHardEnv', + ) + ``` + +* `gym-foo/gym_foo/envs/__init__.py` should have: + ```python + from gym_foo.envs.foo_env import FooEnv + from gym_foo.envs.foo_extrahard_env import FooExtraHardEnv + ``` + +* `gym-foo/gym_foo/envs/foo_env.py` should look something like: + ```python + import gym + from gym import error, spaces, utils + from gym.utils import seeding + + class FooEnv(gym.Env): + metadata = {'render.modes': ['human']} + + def __init__(self): + ... + def step(self, action): + ... + def reset(self): + ... + def render(self, mode='human', close=False): + ... + ``` + +## How to add new environments to Gym, within this repo (not recommended for new environments) + +1. Write your environment in an existing collection or a new collection. All collections are subfolders of `/gym/envs'. +2. Import your environment into the `__init__.py` file of the collection. This file will be located at `/gym/envs/my_collection/__init__.py`. Add `from gym.envs.my_collection.my_awesome_env import MyEnv` to this file. +3. Register your env in `/gym/envs/__init__.py`: + + ``` +register( + id='MyEnv-v0', + entry_point='gym.envs.my_collection:MyEnv', +) +``` + +4. 
Add your environment to the scoreboard in `/gym/scoreboard/__init__.py`: + + ``` +add_task( + id='MyEnv-v0', + summary="Super cool environment", + group='my_collection', + contributor='mygithubhandle', +) +``` diff --git a/src/gym/envs/__init__.py b/src/gym/envs/__init__.py new file mode 100644 index 0000000..cf2978e --- /dev/null +++ b/src/gym/envs/__init__.py @@ -0,0 +1,540 @@ +from gym.envs.registration import registry, register, make, spec + +# Algorithmic +# ---------------------------------------- + +register( + id='Copy-v0', + entry_point='gym.envs.algorithmic:CopyEnv', + max_episode_steps=200, + reward_threshold=25.0, +) + +register( + id='RepeatCopy-v0', + entry_point='gym.envs.algorithmic:RepeatCopyEnv', + max_episode_steps=200, + reward_threshold=75.0, +) + +register( + id='ReversedAddition-v0', + entry_point='gym.envs.algorithmic:ReversedAdditionEnv', + kwargs={'rows': 2}, + max_episode_steps=200, + reward_threshold=25.0, +) + +register( + id='ReversedAddition3-v0', + entry_point='gym.envs.algorithmic:ReversedAdditionEnv', + kwargs={'rows': 3}, + max_episode_steps=200, + reward_threshold=25.0, +) + +register( + id='DuplicatedInput-v0', + entry_point='gym.envs.algorithmic:DuplicatedInputEnv', + max_episode_steps=200, + reward_threshold=9.0, +) + +register( + id='Reverse-v0', + entry_point='gym.envs.algorithmic:ReverseEnv', + max_episode_steps=200, + reward_threshold=25.0, +) + +# Classic +# ---------------------------------------- + +register( + id='CartPole-v0', + entry_point='gym.envs.classic_control:CartPoleEnv', + max_episode_steps=200, + reward_threshold=195.0, +) + +register( + id='CartPole-v1', + entry_point='gym.envs.classic_control:CartPoleEnv', + max_episode_steps=500, + reward_threshold=475.0, +) + +register( + id='MountainCar-v0', + entry_point='gym.envs.classic_control:MountainCarEnv', + max_episode_steps=200, + reward_threshold=-110.0, +) + +register( + id='MountainCarContinuous-v0', + entry_point='gym.envs.classic_control:Continuous_MountainCarEnv', + max_episode_steps=999, + reward_threshold=90.0, +) + +register( + id='Pendulum-v0', + entry_point='gym.envs.classic_control:PendulumEnv', + max_episode_steps=200, +) + +register( + id='Acrobot-v1', + entry_point='gym.envs.classic_control:AcrobotEnv', + max_episode_steps=500, +) + +# Box2d +# ---------------------------------------- + +register( + id='LunarLander-v2', + entry_point='gym.envs.box2d:LunarLander', + max_episode_steps=1000, + reward_threshold=200, +) + +register( + id='LunarLanderContinuous-v2', + entry_point='gym.envs.box2d:LunarLanderContinuous', + max_episode_steps=1000, + reward_threshold=200, +) + +register( + id='BipedalWalker-v2', + entry_point='gym.envs.box2d:BipedalWalker', + max_episode_steps=1600, + reward_threshold=300, +) + +register( + id='BipedalWalkerHardcore-v2', + entry_point='gym.envs.box2d:BipedalWalkerHardcore', + max_episode_steps=2000, + reward_threshold=300, +) + +register( + id='CarRacing-v0', + entry_point='gym.envs.box2d:CarRacing', + max_episode_steps=1000, + reward_threshold=900, +) + +# Toy Text +# ---------------------------------------- + +register( + id='Blackjack-v0', + entry_point='gym.envs.toy_text:BlackjackEnv', +) + +register( + id='KellyCoinflip-v0', + entry_point='gym.envs.toy_text:KellyCoinflipEnv', + reward_threshold=246.61, +) +register( + id='KellyCoinflipGeneralized-v0', + entry_point='gym.envs.toy_text:KellyCoinflipGeneralizedEnv', +) + +register( + id='FrozenLake-v0', + entry_point='gym.envs.toy_text:FrozenLakeEnv', + kwargs={'map_name': '4x4'}, + 
max_episode_steps=100, + reward_threshold=0.78, # optimum = .8196 +) + +register( + id='FrozenLake8x8-v0', + entry_point='gym.envs.toy_text:FrozenLakeEnv', + kwargs={'map_name': '8x8'}, + max_episode_steps=200, + reward_threshold=0.99, # optimum = 1 +) + +register( + id='CliffWalking-v0', + entry_point='gym.envs.toy_text:CliffWalkingEnv', +) + +register( + id='NChain-v0', + entry_point='gym.envs.toy_text:NChainEnv', + max_episode_steps=1000, +) + +register( + id='Roulette-v0', + entry_point='gym.envs.toy_text:RouletteEnv', + max_episode_steps=100, +) + +register( + id='Taxi-v2', + entry_point='gym.envs.toy_text.taxi:TaxiEnv', + reward_threshold=8, # optimum = 8.46 + max_episode_steps=200, +) + +register( + id='GuessingGame-v0', + entry_point='gym.envs.toy_text.guessing_game:GuessingGame', + max_episode_steps=200, +) + +register( + id='HotterColder-v0', + entry_point='gym.envs.toy_text.hotter_colder:HotterColder', + max_episode_steps=200, +) + +# Mujoco +# ---------------------------------------- + +# 2D + +register( + id='Reacher-v2', + entry_point='gym.envs.mujoco:ReacherEnv', + max_episode_steps=50, + reward_threshold=-3.75, +) + +register( + id='Pusher-v2', + entry_point='gym.envs.mujoco:PusherEnv', + max_episode_steps=100, + reward_threshold=0.0, +) + +register( + id='Thrower-v2', + entry_point='gym.envs.mujoco:ThrowerEnv', + max_episode_steps=100, + reward_threshold=0.0, +) + +register( + id='Striker-v2', + entry_point='gym.envs.mujoco:StrikerEnv', + max_episode_steps=100, + reward_threshold=0.0, +) + +register( + id='InvertedPendulum-v2', + entry_point='gym.envs.mujoco:InvertedPendulumEnv', + max_episode_steps=1000, + reward_threshold=950.0, +) + +register( + id='InvertedDoublePendulum-v2', + entry_point='gym.envs.mujoco:InvertedDoublePendulumEnv', + max_episode_steps=1000, + reward_threshold=9100.0, +) + +register( + id='HalfCheetah-v2', + entry_point='gym.envs.mujoco:HalfCheetahEnv', + max_episode_steps=1000, + reward_threshold=4800.0, +) + +register( + id='Hopper-v2', + entry_point='gym.envs.mujoco:HopperEnv', + max_episode_steps=1000, + reward_threshold=3800.0, +) + +register( + id='Swimmer-v2', + entry_point='gym.envs.mujoco:SwimmerEnv', + max_episode_steps=1000, + reward_threshold=360.0, +) + +register( + id='Walker2d-v2', + max_episode_steps=1000, + entry_point='gym.envs.mujoco:Walker2dEnv', +) + +register( + id='Ant-v2', + entry_point='gym.envs.mujoco:AntEnv', + max_episode_steps=1000, + reward_threshold=6000.0, +) + +register( + id='Humanoid-v2', + entry_point='gym.envs.mujoco:HumanoidEnv', + max_episode_steps=1000, +) + +register( + id='HumanoidStandup-v2', + entry_point='gym.envs.mujoco:HumanoidStandupEnv', + max_episode_steps=1000, +) + + +# Robotics +# ---------------------------------------- + +def _merge(a, b): + a.update(b) + return a + + +for reward_type in ['sparse', 'dense']: + suffix = 'Dense' if reward_type == 'dense' else '' + kwargs = { + 'reward_type': reward_type, + } + + # Fetch + register( + id='FetchSlide{}-v1'.format(suffix), + entry_point='gym.envs.robotics:FetchSlideEnv', + kwargs=kwargs, + max_episode_steps=50, + ) + + register( + id='FetchPickAndPlace{}-v1'.format(suffix), + entry_point='gym.envs.robotics:FetchPickAndPlaceEnv', + kwargs=kwargs, + max_episode_steps=50, + ) + + register( + id='FetchReach{}-v1'.format(suffix), + entry_point='gym.envs.robotics:FetchReachEnv', + kwargs=kwargs, + max_episode_steps=50, + ) + + register( + id='FetchPush{}-v1'.format(suffix), + entry_point='gym.envs.robotics:FetchPushEnv', + kwargs=kwargs, + 
max_episode_steps=50, + ) + + # Hand + register( + id='HandReach{}-v0'.format(suffix), + entry_point='gym.envs.robotics:HandReachEnv', + kwargs=kwargs, + max_episode_steps=50, + ) + + register( + id='HandManipulateBlockRotateZ{}-v0'.format(suffix), + entry_point='gym.envs.robotics:HandBlockEnv', + kwargs=_merge({'target_position': 'ignore', 'target_rotation': 'z'}, kwargs), + max_episode_steps=100, + ) + + register( + id='HandManipulateBlockRotateParallel{}-v0'.format(suffix), + entry_point='gym.envs.robotics:HandBlockEnv', + kwargs=_merge({'target_position': 'ignore', 'target_rotation': 'parallel'}, kwargs), + max_episode_steps=100, + ) + + register( + id='HandManipulateBlockRotateXYZ{}-v0'.format(suffix), + entry_point='gym.envs.robotics:HandBlockEnv', + kwargs=_merge({'target_position': 'ignore', 'target_rotation': 'xyz'}, kwargs), + max_episode_steps=100, + ) + + register( + id='HandManipulateBlockFull{}-v0'.format(suffix), + entry_point='gym.envs.robotics:HandBlockEnv', + kwargs=_merge({'target_position': 'random', 'target_rotation': 'xyz'}, kwargs), + max_episode_steps=100, + ) + + # Alias for "Full" + register( + id='HandManipulateBlock{}-v0'.format(suffix), + entry_point='gym.envs.robotics:HandBlockEnv', + kwargs=_merge({'target_position': 'random', 'target_rotation': 'xyz'}, kwargs), + max_episode_steps=100, + ) + + register( + id='HandManipulateEggRotate{}-v0'.format(suffix), + entry_point='gym.envs.robotics:HandEggEnv', + kwargs=_merge({'target_position': 'ignore', 'target_rotation': 'xyz'}, kwargs), + max_episode_steps=100, + ) + + register( + id='HandManipulateEggFull{}-v0'.format(suffix), + entry_point='gym.envs.robotics:HandEggEnv', + kwargs=_merge({'target_position': 'random', 'target_rotation': 'xyz'}, kwargs), + max_episode_steps=100, + ) + + # Alias for "Full" + register( + id='HandManipulateEgg{}-v0'.format(suffix), + entry_point='gym.envs.robotics:HandEggEnv', + kwargs=_merge({'target_position': 'random', 'target_rotation': 'xyz'}, kwargs), + max_episode_steps=100, + ) + + register( + id='HandManipulatePenRotate{}-v0'.format(suffix), + entry_point='gym.envs.robotics:HandPenEnv', + kwargs=_merge({'target_position': 'ignore', 'target_rotation': 'xyz'}, kwargs), + max_episode_steps=100, + ) + + register( + id='HandManipulatePenFull{}-v0'.format(suffix), + entry_point='gym.envs.robotics:HandPenEnv', + kwargs=_merge({'target_position': 'random', 'target_rotation': 'xyz'}, kwargs), + max_episode_steps=100, + ) + + # Alias for "Full" + register( + id='HandManipulatePen{}-v0'.format(suffix), + entry_point='gym.envs.robotics:HandPenEnv', + kwargs=_merge({'target_position': 'random', 'target_rotation': 'xyz'}, kwargs), + max_episode_steps=100, + ) + +# Atari +# ---------------------------------------- + +# # print ', '.join(["'{}'".format(name.split('.')[0]) for name in atari_py.list_games()]) +for game in ['air_raid', 'alien', 'amidar', 'assault', 'asterix', 'asteroids', 'atlantis', + 'bank_heist', 'battle_zone', 'beam_rider', 'berzerk', 'bowling', 'boxing', 'breakout', 'carnival', + 'centipede', 'chopper_command', 'crazy_climber', 'demon_attack', 'double_dunk', + 'elevator_action', 'enduro', 'fishing_derby', 'freeway', 'frostbite', 'gopher', 'gravitar', + 'hero', 'ice_hockey', 'jamesbond', 'journey_escape', 'kangaroo', 'krull', 'kung_fu_master', + 'montezuma_revenge', 'ms_pacman', 'name_this_game', 'phoenix', 'pitfall', 'pong', 'pooyan', + 'private_eye', 'qbert', 'riverraid', 'road_runner', 'robotank', 'seaquest', 'skiing', + 'solaris', 'space_invaders', 'star_gunner', 
'tennis', 'time_pilot', 'tutankham', 'up_n_down', + 'venture', 'video_pinball', 'wizard_of_wor', 'yars_revenge', 'zaxxon']: + for obs_type in ['image', 'ram']: + # space_invaders should yield SpaceInvaders-v0 and SpaceInvaders-ram-v0 + name = ''.join([g.capitalize() for g in game.split('_')]) + if obs_type == 'ram': + name = '{}-ram'.format(name) + + nondeterministic = False + if game == 'elevator_action' and obs_type == 'ram': + # ElevatorAction-ram-v0 seems to yield slightly + # non-deterministic observations about 10% of the time. We + # should track this down eventually, but for now we just + # mark it as nondeterministic. + nondeterministic = True + + register( + id='{}-v0'.format(name), + entry_point='gym.envs.atari:AtariEnv', + kwargs={'game': game, 'obs_type': obs_type, 'repeat_action_probability': 0.25}, + max_episode_steps=10000, + nondeterministic=nondeterministic, + ) + + register( + id='{}-v4'.format(name), + entry_point='gym.envs.atari:AtariEnv', + kwargs={'game': game, 'obs_type': obs_type}, + max_episode_steps=100000, + nondeterministic=nondeterministic, + ) + + # Standard Deterministic (as in the original DeepMind paper) + if game == 'space_invaders': + frameskip = 3 + else: + frameskip = 4 + + # Use a deterministic frame skip. + register( + id='{}Deterministic-v0'.format(name), + entry_point='gym.envs.atari:AtariEnv', + kwargs={'game': game, 'obs_type': obs_type, 'frameskip': frameskip, 'repeat_action_probability': 0.25}, + max_episode_steps=100000, + nondeterministic=nondeterministic, + ) + + register( + id='{}Deterministic-v4'.format(name), + entry_point='gym.envs.atari:AtariEnv', + kwargs={'game': game, 'obs_type': obs_type, 'frameskip': frameskip}, + max_episode_steps=100000, + nondeterministic=nondeterministic, + ) + + register( + id='{}NoFrameskip-v0'.format(name), + entry_point='gym.envs.atari:AtariEnv', + kwargs={'game': game, 'obs_type': obs_type, 'frameskip': 1, 'repeat_action_probability': 0.25}, + # A frameskip of 1 means we get every frame + max_episode_steps=frameskip * 100000, + nondeterministic=nondeterministic, + ) + + # No frameskip. (Atari has no entropy source, so these are + # deterministic environments.) 
+ register( + id='{}NoFrameskip-v4'.format(name), + entry_point='gym.envs.atari:AtariEnv', + kwargs={'game': game, 'obs_type': obs_type, 'frameskip': 1}, # A frameskip of 1 means we get every frame + max_episode_steps=frameskip * 100000, + nondeterministic=nondeterministic, + ) + +# Unit test +# --------- + +register( + id='CubeCrash-v0', + entry_point='gym.envs.unittest:CubeCrash', + reward_threshold=0.9, +) +register( + id='CubeCrashSparse-v0', + entry_point='gym.envs.unittest:CubeCrashSparse', + reward_threshold=0.9, +) +register( + id='CubeCrashScreenBecomesBlack-v0', + entry_point='gym.envs.unittest:CubeCrashScreenBecomesBlack', + reward_threshold=0.9, +) + +register( + id='MemorizeDigits-v0', + entry_point='gym.envs.unittest:MemorizeDigits', + reward_threshold=20, +) + +register( + id='Pilesos-v0', + entry_point='gym.envs.pilesos:PilesosEnv', + max_episode_steps=5000, +) diff --git a/src/gym/envs/__pycache__/__init__.cpython-37.pyc b/src/gym/envs/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..b01c4eb Binary files /dev/null and b/src/gym/envs/__pycache__/__init__.cpython-37.pyc differ diff --git a/src/gym/envs/__pycache__/registration.cpython-37.pyc b/src/gym/envs/__pycache__/registration.cpython-37.pyc new file mode 100644 index 0000000..893c2f6 Binary files /dev/null and b/src/gym/envs/__pycache__/registration.cpython-37.pyc differ diff --git a/src/gym/envs/algorithmic/__init__.py b/src/gym/envs/algorithmic/__init__.py new file mode 100644 index 0000000..da5e719 --- /dev/null +++ b/src/gym/envs/algorithmic/__init__.py @@ -0,0 +1,5 @@ +from gym.envs.algorithmic.copy_ import CopyEnv +from gym.envs.algorithmic.repeat_copy import RepeatCopyEnv +from gym.envs.algorithmic.duplicated_input import DuplicatedInputEnv +from gym.envs.algorithmic.reverse import ReverseEnv +from gym.envs.algorithmic.reversed_addition import ReversedAdditionEnv diff --git a/src/gym/envs/algorithmic/algorithmic_env.py b/src/gym/envs/algorithmic/algorithmic_env.py new file mode 100644 index 0000000..a84520a --- /dev/null +++ b/src/gym/envs/algorithmic/algorithmic_env.py @@ -0,0 +1,326 @@ +""" +Algorithmic environments have the following traits in common: + +- A 1-d "input tape" or 2-d "input grid" of characters +- A target string which is a deterministic function of the input characters + +Agents control a read head that moves over the input tape. Observations consist +of the single character currently under the read head. The read head may fall +off the end of the tape in any direction. When this happens, agents will observe +a special blank character (with index=env.base) until they get back in bounds. + +Actions consist of 3 sub-actions: + - Direction to move the read head (left or right, plus up and down for 2-d envs) + - Whether to write to the output tape + - Which character to write (ignored if the above sub-action is 0) + +An episode ends when: + - The agent writes the full target string to the output tape. + - The agent writes an incorrect character. + - The agent runs out the time limit. (Which is fairly conservative.) + +Reward schedule: + write a correct character: +1 + write a wrong character: -.5 + run out the clock: -1 + otherwise: 0 + +In the beginning, input strings will be fairly short. After an environment has +been consistently solved over some window of episodes, the environment will +increase the average length of generated strings. Typical env specs require +leveling up many times to reach their reward threshold. 
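The action and reward structure described in the docstring above can be exercised with a short driver loop. The sketch below is illustrative only and is not part of this commit; it assumes the vendored `gym` package and the `Copy-v0` registration from `src/gym/envs/__init__.py` are importable. Each action is the tuple (movement, write flag, character to write); because `CopyEnv`'s target equals its input, echoing the observed character while moving right earns +1 per step until the episode ends.

```python
import gym

# Illustrative sketch (assumption: the vendored gym package above is on sys.path).
env = gym.make('Copy-v0')
obs = env.reset()          # observation: index of the character under the read head
done = False
total_reward = 0.0
while not done:
    # Action = (move right, do write, predicted character), as documented above.
    # For the copy task the target string equals the input, so writing the
    # observed character is always correct (+1); a wrong write would give -0.5
    # and end the episode, per the reward schedule in the module docstring.
    obs, reward, done, info = env.step([1, 1, obs])
    total_reward += reward
print(total_reward)
```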
+""" +from gym import Env, logger +from gym.spaces import Discrete, Tuple +from gym.utils import colorize, seeding +import numpy as np +from six import StringIO +import sys +import math + +class AlgorithmicEnv(Env): + + metadata = {'render.modes': ['human', 'ansi']} + # Only 'promote' the length of generated input strings if the worst of the + # last n episodes was no more than this far from the maximum reward + MIN_REWARD_SHORTFALL_FOR_PROMOTION = -1.0 + + def __init__(self, base=10, chars=False, starting_min_length=2): + """ + base: Number of distinct characters. + chars: If True, use uppercase alphabet. Otherwise, digits. Only affects + rendering. + starting_min_length: Minimum input string length. Ramps up as episodes + are consistently solved. + """ + self.base = base + # Keep track of this many past episodes + self.last = 10 + # Cumulative reward earned this episode + self.episode_total_reward = None + # Running tally of reward shortfalls. e.g. if there were 10 points to earn and + # we got 8, we'd append -2 + AlgorithmicEnv.reward_shortfalls = [] + if chars: + self.charmap = [chr(ord('A')+i) for i in range(base)] + else: + self.charmap = [str(i) for i in range(base)] + self.charmap.append(' ') + # TODO: Not clear why this is a class variable rather than instance. + # Could lead to some spooky action at a distance if someone is working + # with multiple algorithmic envs at once. Also makes testing tricky. + AlgorithmicEnv.min_length = starting_min_length + # Three sub-actions: + # 1. Move read head left or write (or up/down) + # 2. Write or not + # 3. Which character to write. (Ignored if should_write=0) + self.action_space = Tuple( + [Discrete(len(self.MOVEMENTS)), Discrete(2), Discrete(self.base)] + ) + # Can see just what is on the input tape (one of n characters, or nothing) + self.observation_space = Discrete(self.base + 1) + self.seed() + self.reset() + + @classmethod + def _movement_idx(kls, movement_name): + return kls.MOVEMENTS.index(movement_name) + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def _get_obs(self, pos=None): + """Return an observation corresponding to the given read head position + (or the current read head position, if none is given).""" + raise NotImplemented + + def _get_str_obs(self, pos=None): + ret = self._get_obs(pos) + return self.charmap[ret] + + def _get_str_target(self, pos): + """Return the ith character of the target string (or " " if index + out of bounds).""" + if pos < 0 or len(self.target) <= pos: + return " " + else: + return self.charmap[self.target[pos]] + + def render_observation(self): + """Return a string representation of the input tape/grid.""" + raise NotImplementedError + + def render(self, mode='human'): + + outfile = StringIO() if mode == 'ansi' else sys.stdout + inp = "Total length of input instance: %d, step: %d\n" % (self.input_width, self.time) + outfile.write(inp) + x, y, action = self.read_head_position, self.write_head_position, self.last_action + if action is not None: + inp_act, out_act, pred = action + outfile.write("=" * (len(inp) - 1) + "\n") + y_str = "Output Tape : " + target_str = "Targets : " + if action is not None: + pred_str = self.charmap[pred] + x_str = self.render_observation() + for i in range(-2, len(self.target) + 2): + target_str += self._get_str_target(i) + if i < y - 1: + y_str += self._get_str_target(i) + elif i == (y - 1): + if action is not None and out_act == 1: + color = 'green' if pred == self.target[i] else 'red' + y_str += colorize(pred_str, 
color, highlight=True) + else: + y_str += self._get_str_target(i) + outfile.write(x_str) + outfile.write(y_str + "\n") + outfile.write(target_str + "\n\n") + + if action is not None: + outfile.write("Current reward : %.3f\n" % self.last_reward) + outfile.write("Cumulative reward : %.3f\n" % self.episode_total_reward) + move = self.MOVEMENTS[inp_act] + outfile.write("Action : Tuple(move over input: %s,\n" % move) + out_act = out_act == 1 + outfile.write(" write to the output tape: %s,\n" % out_act) + outfile.write(" prediction: %s)\n" % pred_str) + else: + outfile.write("\n" * 5) + return outfile + + @property + def input_width(self): + return len(self.input_data) + + def step(self, action): + assert self.action_space.contains(action) + self.last_action = action + inp_act, out_act, pred = action + done = False + reward = 0.0 + self.time += 1 + assert 0 <= self.write_head_position + if out_act == 1: + try: + correct = pred == self.target[self.write_head_position] + except IndexError: + logger.warn("It looks like you're calling step() even though this "+ + "environment has already returned done=True. You should always call "+ + "reset() once you receive done=True. Any further steps are undefined "+ + "behaviour.") + correct = False + if correct: + reward = 1.0 + else: + # Bail as soon as a wrong character is written to the tape + reward = -0.5 + done = True + self.write_head_position += 1 + if self.write_head_position >= len(self.target): + done = True + self._move(inp_act) + if self.time > self.time_limit: + reward = -1.0 + done = True + obs = self._get_obs() + self.last_reward = reward + self.episode_total_reward += reward + return (obs, reward, done, {}) + + @property + def time_limit(self): + """If an agent takes more than this many timesteps, end the episode + immediately and return a negative reward.""" + # (Seemingly arbitrary) + return self.input_width + len(self.target) + 4 + + def _check_levelup(self): + """Called between episodes. Update our running record of episode rewards + and, if appropriate, 'level up' minimum input length.""" + if self.episode_total_reward is None: + # This is before the first episode/call to reset(). 
Nothing to do + return + AlgorithmicEnv.reward_shortfalls.append(self.episode_total_reward - len(self.target)) + AlgorithmicEnv.reward_shortfalls = AlgorithmicEnv.reward_shortfalls[-self.last:] + if len(AlgorithmicEnv.reward_shortfalls) == self.last and \ + min(AlgorithmicEnv.reward_shortfalls) >= self.MIN_REWARD_SHORTFALL_FOR_PROMOTION and \ + AlgorithmicEnv.min_length < 30: + AlgorithmicEnv.min_length += 1 + AlgorithmicEnv.reward_shortfalls = [] + + + def reset(self): + self._check_levelup() + self.last_action = None + self.last_reward = 0 + self.read_head_position = self.READ_HEAD_START + self.write_head_position = 0 + self.episode_total_reward = 0.0 + self.time = 0 + length = self.np_random.randint(3) + AlgorithmicEnv.min_length + self.input_data = self.generate_input_data(length) + self.target = self.target_from_input_data(self.input_data) + return self._get_obs() + + def generate_input_data(self, size): + raise NotImplemented + + def target_from_input_data(self, input_data): + raise NotImplemented("Subclasses must implement") + + def _move(self, movement): + raise NotImplemented + +class TapeAlgorithmicEnv(AlgorithmicEnv): + """An algorithmic env with a 1-d input tape.""" + MOVEMENTS = ['left', 'right'] + READ_HEAD_START = 0 + + def _move(self, movement): + named = self.MOVEMENTS[movement] + self.read_head_position += 1 if named == 'right' else -1 + + def _get_obs(self, pos=None): + if pos is None: + pos = self.read_head_position + if pos < 0: + return self.base + if isinstance(pos, np.ndarray): + pos = pos.item() + try: + return self.input_data[pos] + except IndexError: + return self.base + + def generate_input_data(self, size): + return [self.np_random.randint(self.base) for _ in range(size)] + + def render_observation(self): + x = self.read_head_position + x_str = "Observation Tape : " + for i in range(-2, self.input_width + 2): + if i == x: + x_str += colorize(self._get_str_obs(np.array([i])), 'green', highlight=True) + else: + x_str += self._get_str_obs(np.array([i])) + x_str += "\n" + return x_str + +class GridAlgorithmicEnv(AlgorithmicEnv): + """An algorithmic env with a 2-d input grid.""" + MOVEMENTS = ['left', 'right', 'up', 'down'] + READ_HEAD_START = (0, 0) + def __init__(self, rows, *args, **kwargs): + self.rows = rows + AlgorithmicEnv.__init__(self, *args, **kwargs) + + def _move(self, movement): + named = self.MOVEMENTS[movement] + x, y = self.read_head_position + if named == 'left': + x -= 1 + elif named == 'right': + x += 1 + elif named == 'up': + y -= 1 + elif named == 'down': + y += 1 + else: + raise ValueError("Unrecognized direction: {}".format(named)) + self.read_head_position = x, y + + def generate_input_data(self, size): + return [ + [self.np_random.randint(self.base) for _ in range(self.rows)] + for __ in range(size) + ] + + def _get_obs(self, pos=None): + if pos is None: + pos = self.read_head_position + x, y = pos + if any(idx < 0 for idx in pos): + return self.base + try: + return self.input_data[x][y] + except IndexError: + return self.base + + def render_observation(self): + x = self.read_head_position + label = "Observation Grid : " + x_str = "" + for j in range(-1, self.rows+1): + if j != -1: + x_str += " " * len(label) + for i in range(-2, self.input_width + 2): + if i == x[0] and j == x[1]: + x_str += colorize(self._get_str_obs((i, j)), 'green', highlight=True) + else: + x_str += self._get_str_obs((i, j)) + x_str += "\n" + x_str = label + x_str + return x_str diff --git a/src/gym/envs/algorithmic/copy_.py b/src/gym/envs/algorithmic/copy_.py new file 
mode 100644 index 0000000..7c6dfdf --- /dev/null +++ b/src/gym/envs/algorithmic/copy_.py @@ -0,0 +1,13 @@ +""" +Task is to copy content from the input tape to +the output tape. http://arxiv.org/abs/1511.07275 +""" +from gym.envs.algorithmic import algorithmic_env + +class CopyEnv(algorithmic_env.TapeAlgorithmicEnv): + def __init__(self, base=5, chars=True): + super(CopyEnv, self).__init__(base=base, chars=chars) + + def target_from_input_data(self, input_data): + return input_data + diff --git a/src/gym/envs/algorithmic/duplicated_input.py b/src/gym/envs/algorithmic/duplicated_input.py new file mode 100644 index 0000000..d992814 --- /dev/null +++ b/src/gym/envs/algorithmic/duplicated_input.py @@ -0,0 +1,24 @@ +""" +Task is to return every nth character from the input tape. +http://arxiv.org/abs/1511.07275 +""" +from __future__ import division +from gym.envs.algorithmic import algorithmic_env + +class DuplicatedInputEnv(algorithmic_env.TapeAlgorithmicEnv): + def __init__(self, duplication=2, base=5): + self.duplication = duplication + super(DuplicatedInputEnv, self).__init__(base=base, chars=True) + + def generate_input_data(self, size): + res = [] + if size < self.duplication: + size = self.duplication + for i in range(size//self.duplication): + char = self.np_random.randint(self.base) + for _ in range(self.duplication): + res.append(char) + return res + + def target_from_input_data(self, input_data): + return [input_data[i] for i in range(0, len(input_data), self.duplication)] diff --git a/src/gym/envs/algorithmic/repeat_copy.py b/src/gym/envs/algorithmic/repeat_copy.py new file mode 100644 index 0000000..0c79322 --- /dev/null +++ b/src/gym/envs/algorithmic/repeat_copy.py @@ -0,0 +1,15 @@ +""" +Task is to copy content multiple times from the input tape to +the output tape. http://arxiv.org/abs/1511.07275 +""" +from gym.envs.algorithmic import algorithmic_env + +class RepeatCopyEnv(algorithmic_env.TapeAlgorithmicEnv): + MIN_REWARD_SHORTFALL_FOR_PROMOTION = -.1 + def __init__(self, base=5): + super(RepeatCopyEnv, self).__init__(base=base, chars=True) + self.last = 50 + + def target_from_input_data(self, input_data): + return input_data + list(reversed(input_data)) + input_data + diff --git a/src/gym/envs/algorithmic/reverse.py b/src/gym/envs/algorithmic/reverse.py new file mode 100644 index 0000000..489e4af --- /dev/null +++ b/src/gym/envs/algorithmic/reverse.py @@ -0,0 +1,15 @@ +""" +Task is to reverse content over the input tape. 
+http://arxiv.org/abs/1511.07275 +""" + +from gym.envs.algorithmic import algorithmic_env + +class ReverseEnv(algorithmic_env.TapeAlgorithmicEnv): + MIN_REWARD_SHORTFALL_FOR_PROMOTION = -.1 + def __init__(self, base=2): + super(ReverseEnv, self).__init__(base=base, chars=True, starting_min_length=1) + self.last = 50 + + def target_from_input_data(self, input_str): + return list(reversed(input_str)) diff --git a/src/gym/envs/algorithmic/reversed_addition.py b/src/gym/envs/algorithmic/reversed_addition.py new file mode 100644 index 0000000..eb5c790 --- /dev/null +++ b/src/gym/envs/algorithmic/reversed_addition.py @@ -0,0 +1,30 @@ +from __future__ import division +import numpy as np +from gym.envs.algorithmic import algorithmic_env + +class ReversedAdditionEnv(algorithmic_env.GridAlgorithmicEnv): + def __init__(self, rows=2, base=3): + super(ReversedAdditionEnv, self).__init__(rows=rows, base=base, chars=False) + + def target_from_input_data(self, input_strings): + curry = 0 + target = [] + for digits in input_strings: + total = sum(digits) + curry + target.append(total % self.base) + curry = total // self.base + + if curry > 0: + target.append(curry) + return target + + @property + def time_limit(self): + # Quirk preserved for the sake of consistency: add the length of the input + # rather than the length of the desired output (which may differ if there's + # an extra carried digit). + # TODO: It seems like this time limit is so strict as to make Addition3-v0 + # unsolvable, since agents aren't even given enough time steps to look at + # all the digits. (The solutions on the scoreboard seem to only work by + # save-scumming.) + return self.input_width*2 + 4 diff --git a/src/gym/envs/algorithmic/tests/__init__.py b/src/gym/envs/algorithmic/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/gym/envs/algorithmic/tests/test_algorithmic.py b/src/gym/envs/algorithmic/tests/test_algorithmic.py new file mode 100644 index 0000000..7857f05 --- /dev/null +++ b/src/gym/envs/algorithmic/tests/test_algorithmic.py @@ -0,0 +1,239 @@ +from gym.envs import algorithmic as alg +import unittest + +# All concrete subclasses of AlgorithmicEnv +ALL_ENVS = [ + alg.copy_.CopyEnv, + alg.duplicated_input.DuplicatedInputEnv, + alg.repeat_copy.RepeatCopyEnv, + alg.reverse.ReverseEnv, + alg.reversed_addition.ReversedAdditionEnv, +] +ALL_TAPE_ENVS = [env for env in ALL_ENVS + if issubclass(env, alg.algorithmic_env.TapeAlgorithmicEnv)] +ALL_GRID_ENVS = [env for env in ALL_ENVS + if issubclass(env, alg.algorithmic_env.GridAlgorithmicEnv)] + +def imprint(env, input_arr): + """Monkey-patch the given environment so that when reset() is called, the + input tape/grid will be set to the given data, rather than being randomly + generated.""" + env.generate_input_data = lambda _: input_arr + +class TestAlgorithmicEnvInteractions(unittest.TestCase): + """Test some generic behaviour not specific to any particular algorithmic + environment. 
Movement, allocation of rewards, etc.""" + CANNED_INPUT = [0, 1] + ENV_KLS = alg.copy_.CopyEnv + LEFT, RIGHT = ENV_KLS._movement_idx('left'), ENV_KLS._movement_idx('right') + def setUp(self): + self.env = self.ENV_KLS(base=2, chars=True) + imprint(self.env, self.CANNED_INPUT) + + def test_successful_interaction(self): + obs = self.env.reset() + self.assertEqual(obs, 0) + obs, reward, done, _ = self.env.step([self.RIGHT, 1, 0]) + self.assertEqual(obs, 1) + self.assertGreater(reward, 0) + self.assertFalse(done) + obs, reward, done, _ = self.env.step([self.LEFT, 1, 1]) + self.assertTrue(done) + self.assertGreater(reward, 0) + + def test_bad_output_fail_fast(self): + obs = self.env.reset() + obs, reward, done, _ = self.env.step([self.RIGHT, 1, 1]) + self.assertTrue(done) + self.assertLess(reward, 0) + + def test_levelup(self): + obs = self.env.reset() + # Kind of a hack + alg.algorithmic_env.AlgorithmicEnv.reward_shortfalls = [] + min_length = self.env.min_length + for i in range(self.env.last): + obs, reward, done, _ = self.env.step([self.RIGHT, 1, 0]) + self.assertFalse(done) + obs, reward, done, _ = self.env.step([self.RIGHT, 1, 1]) + self.assertTrue(done) + self.env.reset() + if i < self.env.last-1: + self.assertEqual(len(alg.algorithmic_env.AlgorithmicEnv.reward_shortfalls), i+1) + else: + # Should have leveled up on the last iteration + self.assertEqual(self.env.min_length, min_length+1) + self.assertEqual(len(alg.algorithmic_env.AlgorithmicEnv.reward_shortfalls), 0) + + def test_walk_off_the_end(self): + obs = self.env.reset() + # Walk off the end + obs, r, done, _ = self.env.step([self.LEFT, 0, 0]) + self.assertEqual(obs, self.env.base) + self.assertEqual(r, 0) + self.assertFalse(done) + # Walk further off track + obs, r, done, _ = self.env.step([self.LEFT, 0, 0]) + self.assertEqual(obs, self.env.base) + self.assertFalse(done) + # Return to the first input character + obs, r, done, _ = self.env.step([self.RIGHT, 0, 0]) + self.assertEqual(obs, self.env.base) + self.assertFalse(done) + obs, r, done, _ = self.env.step([self.RIGHT, 0, 0]) + self.assertEqual(obs, 0) + + def test_grid_naviation(self): + env = alg.reversed_addition.ReversedAdditionEnv(rows=2, base=6) + N,S,E,W = [env._movement_idx(named_dir) for named_dir in ['up', 'down', 'right', 'left']] + # Corresponds to a grid that looks like... 
+ # 0 1 2 + # 3 4 5 + canned = [ [0, 3], [1, 4], [2, 5] ] + imprint(env, canned) + obs = env.reset() + self.assertEqual(obs, 0) + navigation = [ + (S, 3), (N, 0), (E, 1), (S, 4), (S, 6), (E, 6), (N, 5), (N, 2), (W, 1) + ] + for (movement, expected_obs) in navigation: + obs, reward, done, _ = env.step([movement, 0, 0]) + self.assertEqual(reward, 0) + self.assertFalse(done) + self.assertEqual(obs, expected_obs) + + def test_grid_success(self): + env = alg.reversed_addition.ReversedAdditionEnv(rows=2, base=3) + canned = [ [1, 2], [1, 0], [2, 2] ] + imprint(env, canned) + obs = env.reset() + target = [0, 2, 1, 1] + self.assertEqual(env.target, target) + self.assertEqual(obs, 1) + for i, target_digit in enumerate(target): + obs, reward, done, _ = env.step([0, 1, target_digit]) + self.assertGreater(reward, 0) + self.assertEqual(done, i==len(target)-1) + + def test_sane_time_limit(self): + obs = self.env.reset() + self.assertLess(self.env.time_limit, 100) + for _ in range(100): + obs, r, done, _ = self.env.step([self.LEFT, 0, 0]) + if done: + return + self.fail("Time limit wasn't enforced") + + def test_rendering(self): + env = self.env + obs = env.reset() + self.assertEqual(env._get_str_obs(), 'A') + self.assertEqual(env._get_str_obs(1), 'B') + self.assertEqual(env._get_str_obs(-1), ' ') + self.assertEqual(env._get_str_obs(2), ' ') + self.assertEqual(env._get_str_target(0), 'A') + self.assertEqual(env._get_str_target(1), 'B') + # Test numerical alphabet rendering + env = self.ENV_KLS(base=3, chars=False) + imprint(env, self.CANNED_INPUT) + env.reset() + self.assertEqual(env._get_str_obs(), '0') + self.assertEqual(env._get_str_obs(1), '1') + + +class TestTargets(unittest.TestCase): + """Test the rules mapping input strings/grids to target outputs.""" + def test_reverse_target(self): + input_expected = [ + ([0], [0]), + ([0, 1], [1, 0]), + ([1, 1], [1, 1]), + ([1, 0, 1], [1, 0, 1]), + ([0, 0, 1, 1], [1, 1, 0, 0]), + ] + env = alg.reverse.ReverseEnv() + for input_arr, expected in input_expected: + target = env.target_from_input_data(input_arr) + self.assertEqual(target, expected) + + def test_reversed_addition_target(self): + env = alg.reversed_addition.ReversedAdditionEnv(base=3) + input_expected = [ + ([[1,1], [1,1]], [2, 2]), + ([[2,2], [0,1]], [1, 2]), + ([[2,1], [1,1], [1,1], [1,0]], [0, 0, 0, 2]), + ] + for (input_grid, expected_target) in input_expected: + self.assertEqual(env.target_from_input_data(input_grid), expected_target) + + def test_reversed_addition_3rows(self): + env = alg.reversed_addition.ReversedAdditionEnv(base=3, rows=3) + input_expected = [ + ([[1,1,0],[0,1,1]], [2, 2]), + ([[1,1,2],[0,1,1]], [1,0,1]), + ] + for (input_grid, expected_target) in input_expected: + self.assertEqual(env.target_from_input_data(input_grid), expected_target) + + def test_copy_target(self): + env = alg.copy_.CopyEnv() + self.assertEqual(env.target_from_input_data([0, 1, 2]), [0, 1, 2]) + + def test_duplicated_input_target(self): + env = alg.duplicated_input.DuplicatedInputEnv(duplication=2) + self.assertEqual(env.target_from_input_data([0, 0, 0, 0, 1, 1]), [0, 0, 1]) + + def test_repeat_copy_target(self): + env = alg.repeat_copy.RepeatCopyEnv() + self.assertEqual(env.target_from_input_data([0, 1, 2]), [0, 1, 2, 2, 1, 0, 0, 1, 2]) + +class TestInputGeneration(unittest.TestCase): + """Test random input generation. 
+ """ + def test_tape_inputs(self): + for env_kls in ALL_TAPE_ENVS: + env = env_kls() + for size in range(2,5): + input_tape = env.generate_input_data(size) + self.assertTrue(all(0<=x<=env.base for x in input_tape), + "Invalid input tape from env {}: {}".format(env_kls, input_tape)) + # DuplicatedInput needs to generate inputs with even length, + # so it may be short one + self.assertLessEqual(len(input_tape), size) + + def test_grid_inputs(self): + for env_kls in ALL_GRID_ENVS: + env = env_kls() + for size in range(2, 5): + input_grid = env.generate_input_data(size) + # Should get "size" sublists, each of length self.rows (not the + # opposite, as you might expect) + self.assertEqual(len(input_grid), size) + self.assertTrue(all(len(col) == env.rows for col in input_grid)) + self.assertTrue(all(0<=x<=env.base for x in input_grid[0])) + + def test_duplicatedinput_inputs(self): + """The duplicated_input env needs to generate strings with the appropriate + amount of repetiion.""" + env = alg.duplicated_input.DuplicatedInputEnv(duplication=2) + input_tape = env.generate_input_data(4) + self.assertEqual(len(input_tape), 4) + self.assertEqual(input_tape[0], input_tape[1]) + self.assertEqual(input_tape[2], input_tape[3]) + # If requested input size isn't a multiple of duplication, go lower + input_tape = env.generate_input_data(3) + self.assertEqual(len(input_tape), 2) + self.assertEqual(input_tape[0], input_tape[1]) + # If requested input size is *less than* duplication, go up + input_tape = env.generate_input_data(1) + self.assertEqual(len(input_tape), 2) + self.assertEqual(input_tape[0], input_tape[1]) + + env = alg.duplicated_input.DuplicatedInputEnv(duplication=3) + input_tape = env.generate_input_data(6) + self.assertEqual(len(input_tape), 6) + self.assertEqual(input_tape[0], input_tape[1]) + self.assertEqual(input_tape[1], input_tape[2]) + +if __name__ == '__main__': + unittest.main() diff --git a/src/gym/envs/atari/__init__.py b/src/gym/envs/atari/__init__.py new file mode 100644 index 0000000..351106e --- /dev/null +++ b/src/gym/envs/atari/__init__.py @@ -0,0 +1 @@ +from gym.envs.atari.atari_env import AtariEnv diff --git a/src/gym/envs/atari/atari_env.py b/src/gym/envs/atari/atari_env.py new file mode 100644 index 0000000..14122df --- /dev/null +++ b/src/gym/envs/atari/atari_env.py @@ -0,0 +1,192 @@ +import numpy as np +import os +import gym +from gym import error, spaces +from gym import utils +from gym.utils import seeding + +try: + import atari_py +except ImportError as e: + raise error.DependencyNotInstalled("{}. 
(HINT: you can install Atari dependencies by running 'pip install gym[atari]'.)".format(e)) + +def to_ram(ale): + ram_size = ale.getRAMSize() + ram = np.zeros((ram_size),dtype=np.uint8) + ale.getRAM(ram) + return ram + +class AtariEnv(gym.Env, utils.EzPickle): + metadata = {'render.modes': ['human', 'rgb_array']} + + def __init__(self, game='pong', obs_type='ram', frameskip=(2, 5), repeat_action_probability=0.): + """Frameskip should be either a tuple (indicating a random range to + choose from, with the top value exclude), or an int.""" + + utils.EzPickle.__init__(self, game, obs_type, frameskip, repeat_action_probability) + assert obs_type in ('ram', 'image') + + self.game_path = atari_py.get_game_path(game) + if not os.path.exists(self.game_path): + raise IOError('You asked for game %s but path %s does not exist'%(game, self.game_path)) + self._obs_type = obs_type + self.frameskip = frameskip + self.ale = atari_py.ALEInterface() + self.viewer = None + + # Tune (or disable) ALE's action repeat: + # https://github.com/openai/gym/issues/349 + assert isinstance(repeat_action_probability, (float, int)), "Invalid repeat_action_probability: {!r}".format(repeat_action_probability) + self.ale.setFloat('repeat_action_probability'.encode('utf-8'), repeat_action_probability) + + self.seed() + + self._action_set = self.ale.getMinimalActionSet() + self.action_space = spaces.Discrete(len(self._action_set)) + + (screen_width,screen_height) = self.ale.getScreenDims() + if self._obs_type == 'ram': + self.observation_space = spaces.Box(low=0, high=255, dtype=np.uint8, shape=(128,)) + elif self._obs_type == 'image': + self.observation_space = spaces.Box(low=0, high=255, shape=(screen_height, screen_width, 3), dtype=np.uint8) + else: + raise error.Error('Unrecognized observation type: {}'.format(self._obs_type)) + + def seed(self, seed=None): + self.np_random, seed1 = seeding.np_random(seed) + # Derive a random seed. This gets passed as a uint, but gets + # checked as an int elsewhere, so we need to keep it below + # 2**31. + seed2 = seeding.hash_seed(seed1 + 1) % 2**31 + # Empirically, we need to seed before loading the ROM. 
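        # Rough sketch of the reproducibility this seeding order is after (it assumes
        # atari_py is installed and that ALE replays deterministically for a fixed
        # random_seed once the ROM has been reloaded):
        #
        #     env = AtariEnv(game='pong', obs_type='ram')
        #     env.seed(123); first = env.reset()
        #     env.seed(123); again = env.reset()
        #     assert (first == again).all()   # same seed, same initial RAM observation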
+ self.ale.setInt(b'random_seed', seed2) + self.ale.loadROM(self.game_path) + return [seed1, seed2] + + def step(self, a): + reward = 0.0 + action = self._action_set[a] + + if isinstance(self.frameskip, int): + num_steps = self.frameskip + else: + num_steps = self.np_random.randint(self.frameskip[0], self.frameskip[1]) + for _ in range(num_steps): + reward += self.ale.act(action) + ob = self._get_obs() + + return ob, reward, self.ale.game_over(), {"ale.lives": self.ale.lives()} + + def _get_image(self): + return self.ale.getScreenRGB2() + + def _get_ram(self): + return to_ram(self.ale) + + @property + def _n_actions(self): + return len(self._action_set) + + def _get_obs(self): + if self._obs_type == 'ram': + return self._get_ram() + elif self._obs_type == 'image': + img = self._get_image() + return img + + # return: (states, observations) + def reset(self): + self.ale.reset_game() + return self._get_obs() + + def render(self, mode='human'): + img = self._get_image() + if mode == 'rgb_array': + return img + elif mode == 'human': + from gym.envs.classic_control import rendering + if self.viewer is None: + self.viewer = rendering.SimpleImageViewer() + self.viewer.imshow(img) + return self.viewer.isopen + + def close(self): + if self.viewer is not None: + self.viewer.close() + self.viewer = None + + def get_action_meanings(self): + return [ACTION_MEANING[i] for i in self._action_set] + + def get_keys_to_action(self): + KEYWORD_TO_KEY = { + 'UP': ord('w'), + 'DOWN': ord('s'), + 'LEFT': ord('a'), + 'RIGHT': ord('d'), + 'FIRE': ord(' '), + } + + keys_to_action = {} + + for action_id, action_meaning in enumerate(self.get_action_meanings()): + keys = [] + for keyword, key in KEYWORD_TO_KEY.items(): + if keyword in action_meaning: + keys.append(key) + keys = tuple(sorted(keys)) + + assert keys not in keys_to_action + keys_to_action[keys] = action_id + + return keys_to_action + + def clone_state(self): + """Clone emulator state w/o system state. Restoring this state will + *not* give an identical environment. For complete cloning and restoring + of the full state, see `{clone,restore}_full_state()`.""" + state_ref = self.ale.cloneState() + state = self.ale.encodeState(state_ref) + self.ale.deleteState(state_ref) + return state + + def restore_state(self, state): + """Restore emulator state w/o system state.""" + state_ref = self.ale.decodeState(state) + self.ale.restoreState(state_ref) + self.ale.deleteState(state_ref) + + def clone_full_state(self): + """Clone emulator state w/ system state including pseudorandomness. 
+ Restoring this state will give an identical environment.""" + state_ref = self.ale.cloneSystemState() + state = self.ale.encodeState(state_ref) + self.ale.deleteState(state_ref) + return state + + def restore_full_state(self, state): + """Restore emulator state w/ system state including pseudorandomness.""" + state_ref = self.ale.decodeState(state) + self.ale.restoreSystemState(state_ref) + self.ale.deleteState(state_ref) + +ACTION_MEANING = { + 0 : "NOOP", + 1 : "FIRE", + 2 : "UP", + 3 : "RIGHT", + 4 : "LEFT", + 5 : "DOWN", + 6 : "UPRIGHT", + 7 : "UPLEFT", + 8 : "DOWNRIGHT", + 9 : "DOWNLEFT", + 10 : "UPFIRE", + 11 : "RIGHTFIRE", + 12 : "LEFTFIRE", + 13 : "DOWNFIRE", + 14 : "UPRIGHTFIRE", + 15 : "UPLEFTFIRE", + 16 : "DOWNRIGHTFIRE", + 17 : "DOWNLEFTFIRE", +} diff --git a/src/gym/envs/box2d/__init__.py b/src/gym/envs/box2d/__init__.py new file mode 100644 index 0000000..725f319 --- /dev/null +++ b/src/gym/envs/box2d/__init__.py @@ -0,0 +1,4 @@ +from gym.envs.box2d.lunar_lander import LunarLander +from gym.envs.box2d.lunar_lander import LunarLanderContinuous +from gym.envs.box2d.bipedal_walker import BipedalWalker, BipedalWalkerHardcore +from gym.envs.box2d.car_racing import CarRacing diff --git a/src/gym/envs/box2d/bipedal_walker.py b/src/gym/envs/box2d/bipedal_walker.py new file mode 100644 index 0000000..d4c3e00 --- /dev/null +++ b/src/gym/envs/box2d/bipedal_walker.py @@ -0,0 +1,581 @@ +import sys, math +import numpy as np + +import Box2D +from Box2D.b2 import (edgeShape, circleShape, fixtureDef, polygonShape, revoluteJointDef, contactListener) + +import gym +from gym import spaces +from gym.utils import colorize, seeding, EzPickle + +# This is simple 4-joints walker robot environment. +# +# There are two versions: +# +# - Normal, with slightly uneven terrain. +# +# - Hardcore with ladders, stumps, pitfalls. +# +# Reward is given for moving forward, total 300+ points up to the far end. If the robot falls, +# it gets -100. Applying motor torque costs a small amount of points, more optimal agent +# will get better score. +# +# Heuristic is provided for testing, it's also useful to get demonstrations to +# learn from. To run heuristic: +# +# python gym/envs/box2d/bipedal_walker.py +# +# State consists of hull angle speed, angular velocity, horizontal speed, vertical speed, +# position of joints and joints angular speed, legs contact with ground, and 10 lidar +# rangefinder measurements to help to deal with the hardcore version. There's no coordinates +# in the state vector. Lidar is less useful in normal version, but it works. +# +# To solve the game you need to get 300 points in 1600 time steps. +# +# To solve hardcore version you need 300 points in 2000 time steps. +# +# Created by Oleg Klimov. Licensed on the same terms as the rest of OpenAI Gym. 
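A minimal rollout sketch for the interface described above (illustrative only: it constructs the class directly rather than going through gym.make, drives the four motors with random torques, and assumes Box2D is installed, so it will not come anywhere near the 300-point threshold):

    import numpy as np
    from gym.envs.box2d.bipedal_walker import BipedalWalker

    env = BipedalWalker()
    s = env.reset()                    # 24 floats: hull state, joint angles/speeds, leg contacts, 10 lidar readings
    total_reward = 0.0
    for t in range(1600):              # normal version: "solved" means 300 points within 1600 steps
        a = np.random.uniform(-1.0, 1.0, size=4)   # hip/knee torques for both legs, each in [-1, 1]
        s, r, done, info = env.step(a)
        total_reward += r
        if done:                       # fell over (reward -100) or reached the far end
            break
    env.close()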
+ +FPS = 50 +SCALE = 30.0 # affects how fast-paced the game is, forces should be adjusted as well + +MOTORS_TORQUE = 80 +SPEED_HIP = 4 +SPEED_KNEE = 6 +LIDAR_RANGE = 160/SCALE + +INITIAL_RANDOM = 5 + +HULL_POLY =[ + (-30,+9), (+6,+9), (+34,+1), + (+34,-8), (-30,-8) + ] +LEG_DOWN = -8/SCALE +LEG_W, LEG_H = 8/SCALE, 34/SCALE + +VIEWPORT_W = 600 +VIEWPORT_H = 400 + +TERRAIN_STEP = 14/SCALE +TERRAIN_LENGTH = 200 # in steps +TERRAIN_HEIGHT = VIEWPORT_H/SCALE/4 +TERRAIN_GRASS = 10 # low long are grass spots, in steps +TERRAIN_STARTPAD = 20 # in steps +FRICTION = 2.5 + +HULL_FD = fixtureDef( + shape=polygonShape(vertices=[ (x/SCALE,y/SCALE) for x,y in HULL_POLY ]), + density=5.0, + friction=0.1, + categoryBits=0x0020, + maskBits=0x001, # collide only with ground + restitution=0.0) # 0.99 bouncy + +LEG_FD = fixtureDef( + shape=polygonShape(box=(LEG_W/2, LEG_H/2)), + density=1.0, + restitution=0.0, + categoryBits=0x0020, + maskBits=0x001) + +LOWER_FD = fixtureDef( + shape=polygonShape(box=(0.8*LEG_W/2, LEG_H/2)), + density=1.0, + restitution=0.0, + categoryBits=0x0020, + maskBits=0x001) + +class ContactDetector(contactListener): + def __init__(self, env): + contactListener.__init__(self) + self.env = env + def BeginContact(self, contact): + if self.env.hull==contact.fixtureA.body or self.env.hull==contact.fixtureB.body: + self.env.game_over = True + for leg in [self.env.legs[1], self.env.legs[3]]: + if leg in [contact.fixtureA.body, contact.fixtureB.body]: + leg.ground_contact = True + def EndContact(self, contact): + for leg in [self.env.legs[1], self.env.legs[3]]: + if leg in [contact.fixtureA.body, contact.fixtureB.body]: + leg.ground_contact = False + +class BipedalWalker(gym.Env, EzPickle): + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second' : FPS + } + + hardcore = False + + def __init__(self): + EzPickle.__init__(self) + self.seed() + self.viewer = None + + self.world = Box2D.b2World() + self.terrain = None + self.hull = None + + self.prev_shaping = None + + self.fd_polygon = fixtureDef( + shape = polygonShape(vertices= + [(0, 0), + (1, 0), + (1, -1), + (0, -1)]), + friction = FRICTION) + + self.fd_edge = fixtureDef( + shape = edgeShape(vertices= + [(0, 0), + (1, 1)]), + friction = FRICTION, + categoryBits=0x0001, + ) + + self.reset() + + high = np.array([np.inf]*24) + self.action_space = spaces.Box(np.array([-1,-1,-1,-1]), np.array([+1,+1,+1,+1])) + self.observation_space = spaces.Box(-high, high) + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def _destroy(self): + if not self.terrain: return + self.world.contactListener = None + for t in self.terrain: + self.world.DestroyBody(t) + self.terrain = [] + self.world.DestroyBody(self.hull) + self.hull = None + for leg in self.legs: + self.world.DestroyBody(leg) + self.legs = [] + self.joints = [] + + def _generate_terrain(self, hardcore): + GRASS, STUMP, STAIRS, PIT, _STATES_ = range(5) + state = GRASS + velocity = 0.0 + y = TERRAIN_HEIGHT + counter = TERRAIN_STARTPAD + oneshot = False + self.terrain = [] + self.terrain_x = [] + self.terrain_y = [] + for i in range(TERRAIN_LENGTH): + x = i*TERRAIN_STEP + self.terrain_x.append(x) + + if state==GRASS and not oneshot: + velocity = 0.8*velocity + 0.01*np.sign(TERRAIN_HEIGHT - y) + if i > TERRAIN_STARTPAD: velocity += self.np_random.uniform(-1, 1)/SCALE #1 + y += velocity + + elif state==PIT and oneshot: + counter = self.np_random.randint(3, 5) + poly = [ + (x, y), + (x+TERRAIN_STEP, y), + (x+TERRAIN_STEP, 
y-4*TERRAIN_STEP), + (x, y-4*TERRAIN_STEP), + ] + self.fd_polygon.shape.vertices=poly + t = self.world.CreateStaticBody( + fixtures = self.fd_polygon) + t.color1, t.color2 = (1,1,1), (0.6,0.6,0.6) + self.terrain.append(t) + + self.fd_polygon.shape.vertices=[(p[0]+TERRAIN_STEP*counter,p[1]) for p in poly] + t = self.world.CreateStaticBody( + fixtures = self.fd_polygon) + t.color1, t.color2 = (1,1,1), (0.6,0.6,0.6) + self.terrain.append(t) + counter += 2 + original_y = y + + elif state==PIT and not oneshot: + y = original_y + if counter > 1: + y -= 4*TERRAIN_STEP + + elif state==STUMP and oneshot: + counter = self.np_random.randint(1, 3) + poly = [ + (x, y), + (x+counter*TERRAIN_STEP, y), + (x+counter*TERRAIN_STEP, y+counter*TERRAIN_STEP), + (x, y+counter*TERRAIN_STEP), + ] + self.fd_polygon.shape.vertices=poly + t = self.world.CreateStaticBody( + fixtures = self.fd_polygon) + t.color1, t.color2 = (1,1,1), (0.6,0.6,0.6) + self.terrain.append(t) + + elif state==STAIRS and oneshot: + stair_height = +1 if self.np_random.rand() > 0.5 else -1 + stair_width = self.np_random.randint(4, 5) + stair_steps = self.np_random.randint(3, 5) + original_y = y + for s in range(stair_steps): + poly = [ + (x+( s*stair_width)*TERRAIN_STEP, y+( s*stair_height)*TERRAIN_STEP), + (x+((1+s)*stair_width)*TERRAIN_STEP, y+( s*stair_height)*TERRAIN_STEP), + (x+((1+s)*stair_width)*TERRAIN_STEP, y+(-1+s*stair_height)*TERRAIN_STEP), + (x+( s*stair_width)*TERRAIN_STEP, y+(-1+s*stair_height)*TERRAIN_STEP), + ] + self.fd_polygon.shape.vertices=poly + t = self.world.CreateStaticBody( + fixtures = self.fd_polygon) + t.color1, t.color2 = (1,1,1), (0.6,0.6,0.6) + self.terrain.append(t) + counter = stair_steps*stair_width + + elif state==STAIRS and not oneshot: + s = stair_steps*stair_width - counter - stair_height + n = s/stair_width + y = original_y + (n*stair_height)*TERRAIN_STEP + + oneshot = False + self.terrain_y.append(y) + counter -= 1 + if counter==0: + counter = self.np_random.randint(TERRAIN_GRASS/2, TERRAIN_GRASS) + if state==GRASS and hardcore: + state = self.np_random.randint(1, _STATES_) + oneshot = True + else: + state = GRASS + oneshot = True + + self.terrain_poly = [] + for i in range(TERRAIN_LENGTH-1): + poly = [ + (self.terrain_x[i], self.terrain_y[i]), + (self.terrain_x[i+1], self.terrain_y[i+1]) + ] + self.fd_edge.shape.vertices=poly + t = self.world.CreateStaticBody( + fixtures = self.fd_edge) + color = (0.3, 1.0 if i%2==0 else 0.8, 0.3) + t.color1 = color + t.color2 = color + self.terrain.append(t) + color = (0.4, 0.6, 0.3) + poly += [ (poly[1][0], 0), (poly[0][0], 0) ] + self.terrain_poly.append( (poly, color) ) + self.terrain.reverse() + + def _generate_clouds(self): + # Sorry for the clouds, couldn't resist + self.cloud_poly = [] + for i in range(TERRAIN_LENGTH//20): + x = self.np_random.uniform(0, TERRAIN_LENGTH)*TERRAIN_STEP + y = VIEWPORT_H/SCALE*3/4 + poly = [ + (x+15*TERRAIN_STEP*math.sin(3.14*2*a/5)+self.np_random.uniform(0,5*TERRAIN_STEP), + y+ 5*TERRAIN_STEP*math.cos(3.14*2*a/5)+self.np_random.uniform(0,5*TERRAIN_STEP) ) + for a in range(5) ] + x1 = min( [p[0] for p in poly] ) + x2 = max( [p[0] for p in poly] ) + self.cloud_poly.append( (poly,x1,x2) ) + + def reset(self): + self._destroy() + self.world.contactListener_bug_workaround = ContactDetector(self) + self.world.contactListener = self.world.contactListener_bug_workaround + self.game_over = False + self.prev_shaping = None + self.scroll = 0.0 + self.lidar_render = 0 + + W = VIEWPORT_W/SCALE + H = VIEWPORT_H/SCALE + + 
self._generate_terrain(self.hardcore) + self._generate_clouds() + + init_x = TERRAIN_STEP*TERRAIN_STARTPAD/2 + init_y = TERRAIN_HEIGHT+2*LEG_H + self.hull = self.world.CreateDynamicBody( + position = (init_x, init_y), + fixtures = HULL_FD + ) + self.hull.color1 = (0.5,0.4,0.9) + self.hull.color2 = (0.3,0.3,0.5) + self.hull.ApplyForceToCenter((self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM), 0), True) + + self.legs = [] + self.joints = [] + for i in [-1,+1]: + leg = self.world.CreateDynamicBody( + position = (init_x, init_y - LEG_H/2 - LEG_DOWN), + angle = (i*0.05), + fixtures = LEG_FD + ) + leg.color1 = (0.6-i/10., 0.3-i/10., 0.5-i/10.) + leg.color2 = (0.4-i/10., 0.2-i/10., 0.3-i/10.) + rjd = revoluteJointDef( + bodyA=self.hull, + bodyB=leg, + localAnchorA=(0, LEG_DOWN), + localAnchorB=(0, LEG_H/2), + enableMotor=True, + enableLimit=True, + maxMotorTorque=MOTORS_TORQUE, + motorSpeed = i, + lowerAngle = -0.8, + upperAngle = 1.1, + ) + self.legs.append(leg) + self.joints.append(self.world.CreateJoint(rjd)) + + lower = self.world.CreateDynamicBody( + position = (init_x, init_y - LEG_H*3/2 - LEG_DOWN), + angle = (i*0.05), + fixtures = LOWER_FD + ) + lower.color1 = (0.6-i/10., 0.3-i/10., 0.5-i/10.) + lower.color2 = (0.4-i/10., 0.2-i/10., 0.3-i/10.) + rjd = revoluteJointDef( + bodyA=leg, + bodyB=lower, + localAnchorA=(0, -LEG_H/2), + localAnchorB=(0, LEG_H/2), + enableMotor=True, + enableLimit=True, + maxMotorTorque=MOTORS_TORQUE, + motorSpeed = 1, + lowerAngle = -1.6, + upperAngle = -0.1, + ) + lower.ground_contact = False + self.legs.append(lower) + self.joints.append(self.world.CreateJoint(rjd)) + + self.drawlist = self.terrain + self.legs + [self.hull] + + class LidarCallback(Box2D.b2.rayCastCallback): + def ReportFixture(self, fixture, point, normal, fraction): + if (fixture.filterData.categoryBits & 1) == 0: + return 1 + self.p2 = point + self.fraction = fraction + return 0 + self.lidar = [LidarCallback() for _ in range(10)] + + return self.step(np.array([0,0,0,0]))[0] + + def step(self, action): + #self.hull.ApplyForceToCenter((0, 20), True) -- Uncomment this to receive a bit of stability help + control_speed = False # Should be easier as well + if control_speed: + self.joints[0].motorSpeed = float(SPEED_HIP * np.clip(action[0], -1, 1)) + self.joints[1].motorSpeed = float(SPEED_KNEE * np.clip(action[1], -1, 1)) + self.joints[2].motorSpeed = float(SPEED_HIP * np.clip(action[2], -1, 1)) + self.joints[3].motorSpeed = float(SPEED_KNEE * np.clip(action[3], -1, 1)) + else: + self.joints[0].motorSpeed = float(SPEED_HIP * np.sign(action[0])) + self.joints[0].maxMotorTorque = float(MOTORS_TORQUE * np.clip(np.abs(action[0]), 0, 1)) + self.joints[1].motorSpeed = float(SPEED_KNEE * np.sign(action[1])) + self.joints[1].maxMotorTorque = float(MOTORS_TORQUE * np.clip(np.abs(action[1]), 0, 1)) + self.joints[2].motorSpeed = float(SPEED_HIP * np.sign(action[2])) + self.joints[2].maxMotorTorque = float(MOTORS_TORQUE * np.clip(np.abs(action[2]), 0, 1)) + self.joints[3].motorSpeed = float(SPEED_KNEE * np.sign(action[3])) + self.joints[3].maxMotorTorque = float(MOTORS_TORQUE * np.clip(np.abs(action[3]), 0, 1)) + + self.world.Step(1.0/FPS, 6*30, 2*30) + + pos = self.hull.position + vel = self.hull.linearVelocity + + for i in range(10): + self.lidar[i].fraction = 1.0 + self.lidar[i].p1 = pos + self.lidar[i].p2 = ( + pos[0] + math.sin(1.5*i/10.0)*LIDAR_RANGE, + pos[1] - math.cos(1.5*i/10.0)*LIDAR_RANGE) + self.world.RayCast(self.lidar[i], self.lidar[i].p1, self.lidar[i].p2) + + state = [ + 
self.hull.angle, # Normal angles up to 0.5 here, but sure more is possible. + 2.0*self.hull.angularVelocity/FPS, + 0.3*vel.x*(VIEWPORT_W/SCALE)/FPS, # Normalized to get -1..1 range + 0.3*vel.y*(VIEWPORT_H/SCALE)/FPS, + self.joints[0].angle, # This will give 1.1 on high up, but it's still OK (and there should be spikes on hiting the ground, that's normal too) + self.joints[0].speed / SPEED_HIP, + self.joints[1].angle + 1.0, + self.joints[1].speed / SPEED_KNEE, + 1.0 if self.legs[1].ground_contact else 0.0, + self.joints[2].angle, + self.joints[2].speed / SPEED_HIP, + self.joints[3].angle + 1.0, + self.joints[3].speed / SPEED_KNEE, + 1.0 if self.legs[3].ground_contact else 0.0 + ] + state += [l.fraction for l in self.lidar] + assert len(state)==24 + + self.scroll = pos.x - VIEWPORT_W/SCALE/5 + + shaping = 130*pos[0]/SCALE # moving forward is a way to receive reward (normalized to get 300 on completion) + shaping -= 5.0*abs(state[0]) # keep head straight, other than that and falling, any behavior is unpunished + + reward = 0 + if self.prev_shaping is not None: + reward = shaping - self.prev_shaping + self.prev_shaping = shaping + + for a in action: + reward -= 0.00035 * MOTORS_TORQUE * np.clip(np.abs(a), 0, 1) + # normalized to about -50.0 using heuristic, more optimal agent should spend less + + done = False + if self.game_over or pos[0] < 0: + reward = -100 + done = True + if pos[0] > (TERRAIN_LENGTH-TERRAIN_GRASS)*TERRAIN_STEP: + done = True + return np.array(state), reward, done, {} + + def render(self, mode='human'): + from gym.envs.classic_control import rendering + if self.viewer is None: + self.viewer = rendering.Viewer(VIEWPORT_W, VIEWPORT_H) + self.viewer.set_bounds(self.scroll, VIEWPORT_W/SCALE + self.scroll, 0, VIEWPORT_H/SCALE) + + self.viewer.draw_polygon( [ + (self.scroll, 0), + (self.scroll+VIEWPORT_W/SCALE, 0), + (self.scroll+VIEWPORT_W/SCALE, VIEWPORT_H/SCALE), + (self.scroll, VIEWPORT_H/SCALE), + ], color=(0.9, 0.9, 1.0) ) + for poly,x1,x2 in self.cloud_poly: + if x2 < self.scroll/2: continue + if x1 > self.scroll/2 + VIEWPORT_W/SCALE: continue + self.viewer.draw_polygon( [(p[0]+self.scroll/2, p[1]) for p in poly], color=(1,1,1)) + for poly, color in self.terrain_poly: + if poly[1][0] < self.scroll: continue + if poly[0][0] > self.scroll + VIEWPORT_W/SCALE: continue + self.viewer.draw_polygon(poly, color=color) + + self.lidar_render = (self.lidar_render+1) % 100 + i = self.lidar_render + if i < 2*len(self.lidar): + l = self.lidar[i] if i < len(self.lidar) else self.lidar[len(self.lidar)-i-1] + self.viewer.draw_polyline( [l.p1, l.p2], color=(1,0,0), linewidth=1 ) + + for obj in self.drawlist: + for f in obj.fixtures: + trans = f.body.transform + if type(f.shape) is circleShape: + t = rendering.Transform(translation=trans*f.shape.pos) + self.viewer.draw_circle(f.shape.radius, 30, color=obj.color1).add_attr(t) + self.viewer.draw_circle(f.shape.radius, 30, color=obj.color2, filled=False, linewidth=2).add_attr(t) + else: + path = [trans*v for v in f.shape.vertices] + self.viewer.draw_polygon(path, color=obj.color1) + path.append(path[0]) + self.viewer.draw_polyline(path, color=obj.color2, linewidth=2) + + flagy1 = TERRAIN_HEIGHT + flagy2 = flagy1 + 50/SCALE + x = TERRAIN_STEP*3 + self.viewer.draw_polyline( [(x, flagy1), (x, flagy2)], color=(0,0,0), linewidth=2 ) + f = [(x, flagy2), (x, flagy2-10/SCALE), (x+25/SCALE, flagy2-5/SCALE)] + self.viewer.draw_polygon(f, color=(0.9,0.2,0) ) + self.viewer.draw_polyline(f + [f[0]], color=(0,0,0), linewidth=2 ) + + return 
self.viewer.render(return_rgb_array = mode=='rgb_array') + + def close(self): + if self.viewer is not None: + self.viewer.close() + self.viewer = None + +class BipedalWalkerHardcore(BipedalWalker): + hardcore = True + +if __name__=="__main__": + # Heurisic: suboptimal, have no notion of balance. + env = BipedalWalker() + env.reset() + steps = 0 + total_reward = 0 + a = np.array([0.0, 0.0, 0.0, 0.0]) + STAY_ON_ONE_LEG, PUT_OTHER_DOWN, PUSH_OFF = 1,2,3 + SPEED = 0.29 # Will fall forward on higher speed + state = STAY_ON_ONE_LEG + moving_leg = 0 + supporting_leg = 1 - moving_leg + SUPPORT_KNEE_ANGLE = +0.1 + supporting_knee_angle = SUPPORT_KNEE_ANGLE + while True: + s, r, done, info = env.step(a) + total_reward += r + if steps % 20 == 0 or done: + print("\naction " + str(["{:+0.2f}".format(x) for x in a])) + print("step {} total_reward {:+0.2f}".format(steps, total_reward)) + print("hull " + str(["{:+0.2f}".format(x) for x in s[0:4] ])) + print("leg0 " + str(["{:+0.2f}".format(x) for x in s[4:9] ])) + print("leg1 " + str(["{:+0.2f}".format(x) for x in s[9:14]])) + steps += 1 + + contact0 = s[8] + contact1 = s[13] + moving_s_base = 4 + 5*moving_leg + supporting_s_base = 4 + 5*supporting_leg + + hip_targ = [None,None] # -0.8 .. +1.1 + knee_targ = [None,None] # -0.6 .. +0.9 + hip_todo = [0.0, 0.0] + knee_todo = [0.0, 0.0] + + if state==STAY_ON_ONE_LEG: + hip_targ[moving_leg] = 1.1 + knee_targ[moving_leg] = -0.6 + supporting_knee_angle += 0.03 + if s[2] > SPEED: supporting_knee_angle += 0.03 + supporting_knee_angle = min( supporting_knee_angle, SUPPORT_KNEE_ANGLE ) + knee_targ[supporting_leg] = supporting_knee_angle + if s[supporting_s_base+0] < 0.10: # supporting leg is behind + state = PUT_OTHER_DOWN + if state==PUT_OTHER_DOWN: + hip_targ[moving_leg] = +0.1 + knee_targ[moving_leg] = SUPPORT_KNEE_ANGLE + knee_targ[supporting_leg] = supporting_knee_angle + if s[moving_s_base+4]: + state = PUSH_OFF + supporting_knee_angle = min( s[moving_s_base+2], SUPPORT_KNEE_ANGLE ) + if state==PUSH_OFF: + knee_targ[moving_leg] = supporting_knee_angle + knee_targ[supporting_leg] = +1.0 + if s[supporting_s_base+2] > 0.88 or s[2] > 1.2*SPEED: + state = STAY_ON_ONE_LEG + moving_leg = 1 - moving_leg + supporting_leg = 1 - moving_leg + + if hip_targ[0]: hip_todo[0] = 0.9*(hip_targ[0] - s[4]) - 0.25*s[5] + if hip_targ[1]: hip_todo[1] = 0.9*(hip_targ[1] - s[9]) - 0.25*s[10] + if knee_targ[0]: knee_todo[0] = 4.0*(knee_targ[0] - s[6]) - 0.25*s[7] + if knee_targ[1]: knee_todo[1] = 4.0*(knee_targ[1] - s[11]) - 0.25*s[12] + + hip_todo[0] -= 0.9*(0-s[0]) - 1.5*s[1] # PID to keep head strait + hip_todo[1] -= 0.9*(0-s[0]) - 1.5*s[1] + knee_todo[0] -= 15.0*s[3] # vertical speed, to damp oscillations + knee_todo[1] -= 15.0*s[3] + + a[0] = hip_todo[0] + a[1] = knee_todo[0] + a[2] = hip_todo[1] + a[3] = knee_todo[1] + a = np.clip(0.5*a, -1.0, 1.0) + + env.render() + if done: break diff --git a/src/gym/envs/box2d/car_dynamics.py b/src/gym/envs/box2d/car_dynamics.py new file mode 100644 index 0000000..02f6815 --- /dev/null +++ b/src/gym/envs/box2d/car_dynamics.py @@ -0,0 +1,244 @@ +import numpy as np +import math +import Box2D +from Box2D.b2 import (edgeShape, circleShape, fixtureDef, polygonShape, revoluteJointDef, contactListener, shape) + +# Top-down car dynamics simulation. +# +# Some ideas are taken from this great tutorial http://www.iforce2d.net/b2dtut/top-down-car by Chris Campbell. +# This simulation is a bit more detailed, with wheels rotation. +# +# Created by Oleg Klimov. 
Licensed on the same terms as the rest of OpenAI Gym. + +SIZE = 0.02 +ENGINE_POWER = 100000000*SIZE*SIZE +WHEEL_MOMENT_OF_INERTIA = 4000*SIZE*SIZE +FRICTION_LIMIT = 1000000*SIZE*SIZE # friction ~= mass ~= size^2 (calculated implicitly using density) +WHEEL_R = 27 +WHEEL_W = 14 +WHEELPOS = [ + (-55,+80), (+55,+80), + (-55,-82), (+55,-82) + ] +HULL_POLY1 =[ + (-60,+130), (+60,+130), + (+60,+110), (-60,+110) + ] +HULL_POLY2 =[ + (-15,+120), (+15,+120), + (+20, +20), (-20, 20) + ] +HULL_POLY3 =[ + (+25, +20), + (+50, -10), + (+50, -40), + (+20, -90), + (-20, -90), + (-50, -40), + (-50, -10), + (-25, +20) + ] +HULL_POLY4 =[ + (-50,-120), (+50,-120), + (+50,-90), (-50,-90) + ] +WHEEL_COLOR = (0.0,0.0,0.0) +WHEEL_WHITE = (0.3,0.3,0.3) +MUD_COLOR = (0.4,0.4,0.0) + +class Car: + def __init__(self, world, init_angle, init_x, init_y): + self.world = world + self.hull = self.world.CreateDynamicBody( + position = (init_x, init_y), + angle = init_angle, + fixtures = [ + fixtureDef(shape = polygonShape(vertices=[ (x*SIZE,y*SIZE) for x,y in HULL_POLY1 ]), density=1.0), + fixtureDef(shape = polygonShape(vertices=[ (x*SIZE,y*SIZE) for x,y in HULL_POLY2 ]), density=1.0), + fixtureDef(shape = polygonShape(vertices=[ (x*SIZE,y*SIZE) for x,y in HULL_POLY3 ]), density=1.0), + fixtureDef(shape = polygonShape(vertices=[ (x*SIZE,y*SIZE) for x,y in HULL_POLY4 ]), density=1.0) + ] + ) + self.hull.color = (0.8,0.0,0.0) + self.wheels = [] + self.fuel_spent = 0.0 + WHEEL_POLY = [ + (-WHEEL_W,+WHEEL_R), (+WHEEL_W,+WHEEL_R), + (+WHEEL_W,-WHEEL_R), (-WHEEL_W,-WHEEL_R) + ] + for wx,wy in WHEELPOS: + front_k = 1.0 if wy > 0 else 1.0 + w = self.world.CreateDynamicBody( + position = (init_x+wx*SIZE, init_y+wy*SIZE), + angle = init_angle, + fixtures = fixtureDef( + shape=polygonShape(vertices=[ (x*front_k*SIZE,y*front_k*SIZE) for x,y in WHEEL_POLY ]), + density=0.1, + categoryBits=0x0020, + maskBits=0x001, + restitution=0.0) + ) + w.wheel_rad = front_k*WHEEL_R*SIZE + w.color = WHEEL_COLOR + w.gas = 0.0 + w.brake = 0.0 + w.steer = 0.0 + w.phase = 0.0 # wheel angle + w.omega = 0.0 # angular velocity + w.skid_start = None + w.skid_particle = None + rjd = revoluteJointDef( + bodyA=self.hull, + bodyB=w, + localAnchorA=(wx*SIZE,wy*SIZE), + localAnchorB=(0,0), + enableMotor=True, + enableLimit=True, + maxMotorTorque=180*900*SIZE*SIZE, + motorSpeed = 0, + lowerAngle = -0.4, + upperAngle = +0.4, + ) + w.joint = self.world.CreateJoint(rjd) + w.tiles = set() + w.userData = w + self.wheels.append(w) + self.drawlist = self.wheels + [self.hull] + self.particles = [] + + def gas(self, gas): + 'control: rear wheel drive' + gas = np.clip(gas, 0, 1) + for w in self.wheels[2:4]: + diff = gas - w.gas + if diff > 0.1: diff = 0.1 # gradually increase, but stop immediately + w.gas += diff + + def brake(self, b): + 'control: brake b=0..1, more than 0.9 blocks wheels to zero rotation' + for w in self.wheels: + w.brake = b + + def steer(self, s): + 'control: steer s=-1..1, it takes time to rotate steering wheel from side to side, s is target position' + self.wheels[0].steer = s + self.wheels[1].steer = s + + def step(self, dt): + for w in self.wheels: + # Steer each wheel + dir = np.sign(w.steer - w.joint.angle) + val = abs(w.steer - w.joint.angle) + w.joint.motorSpeed = dir*min(50.0*val, 3.0) + + # Position => friction_limit + grass = True + friction_limit = FRICTION_LIMIT*0.6 # Grass friction if no tile + for tile in w.tiles: + friction_limit = max(friction_limit, FRICTION_LIMIT*tile.road_friction) + grass = False + + # Force + forw = 
w.GetWorldVector( (0,1) ) + side = w.GetWorldVector( (1,0) ) + v = w.linearVelocity + vf = forw[0]*v[0] + forw[1]*v[1] # forward speed + vs = side[0]*v[0] + side[1]*v[1] # side speed + + # WHEEL_MOMENT_OF_INERTIA*np.square(w.omega)/2 = E -- energy + # WHEEL_MOMENT_OF_INERTIA*w.omega * domega/dt = dE/dt = W -- power + # domega = dt*W/WHEEL_MOMENT_OF_INERTIA/w.omega + w.omega += dt*ENGINE_POWER*w.gas/WHEEL_MOMENT_OF_INERTIA/(abs(w.omega)+5.0) # small coef not to divide by zero + self.fuel_spent += dt*ENGINE_POWER*w.gas + + if w.brake >= 0.9: + w.omega = 0 + elif w.brake > 0: + BRAKE_FORCE = 15 # radians per second + dir = -np.sign(w.omega) + val = BRAKE_FORCE*w.brake + if abs(val) > abs(w.omega): val = abs(w.omega) # low speed => same as = 0 + w.omega += dir*val + w.phase += w.omega*dt + + vr = w.omega*w.wheel_rad # rotating wheel speed + f_force = -vf + vr # force direction is direction of speed difference + p_force = -vs + + # Physically correct is to always apply friction_limit until speed is equal. + # But dt is finite, that will lead to oscillations if difference is already near zero. + f_force *= 205000*SIZE*SIZE # Random coefficient to cut oscillations in few steps (have no effect on friction_limit) + p_force *= 205000*SIZE*SIZE + force = np.sqrt(np.square(f_force) + np.square(p_force)) + + # Skid trace + if abs(force) > 2.0*friction_limit: + if w.skid_particle and w.skid_particle.grass==grass and len(w.skid_particle.poly) < 30: + w.skid_particle.poly.append( (w.position[0], w.position[1]) ) + elif w.skid_start is None: + w.skid_start = w.position + else: + w.skid_particle = self._create_particle( w.skid_start, w.position, grass ) + w.skid_start = None + else: + w.skid_start = None + w.skid_particle = None + + if abs(force) > friction_limit: + f_force /= force + p_force /= force + force = friction_limit # Correct physics here + f_force *= force + p_force *= force + + w.omega -= dt*f_force*w.wheel_rad/WHEEL_MOMENT_OF_INERTIA + + w.ApplyForceToCenter( ( + p_force*side[0] + f_force*forw[0], + p_force*side[1] + f_force*forw[1]), True ) + + def draw(self, viewer, draw_particles=True): + if draw_particles: + for p in self.particles: + viewer.draw_polyline(p.poly, color=p.color, linewidth=5) + for obj in self.drawlist: + for f in obj.fixtures: + trans = f.body.transform + path = [trans*v for v in f.shape.vertices] + viewer.draw_polygon(path, color=obj.color) + if "phase" not in obj.__dict__: continue + a1 = obj.phase + a2 = obj.phase + 1.2 # radians + s1 = math.sin(a1) + s2 = math.sin(a2) + c1 = math.cos(a1) + c2 = math.cos(a2) + if s1>0 and s2>0: continue + if s1>0: c1 = np.sign(c1) + if s2>0: c2 = np.sign(c2) + white_poly = [ + (-WHEEL_W*SIZE, +WHEEL_R*c1*SIZE), (+WHEEL_W*SIZE, +WHEEL_R*c1*SIZE), + (+WHEEL_W*SIZE, +WHEEL_R*c2*SIZE), (-WHEEL_W*SIZE, +WHEEL_R*c2*SIZE) + ] + viewer.draw_polygon([trans*v for v in white_poly], color=WHEEL_WHITE) + + def _create_particle(self, point1, point2, grass): + class Particle: + pass + p = Particle() + p.color = WHEEL_COLOR if not grass else MUD_COLOR + p.ttl = 1 + p.poly = [(point1[0],point1[1]), (point2[0],point2[1])] + p.grass = grass + self.particles.append(p) + while len(self.particles) > 30: + self.particles.pop(0) + return p + + def destroy(self): + self.world.DestroyBody(self.hull) + self.hull = None + for w in self.wheels: + self.world.DestroyBody(w) + self.wheels = [] + diff --git a/src/gym/envs/box2d/car_racing.py b/src/gym/envs/box2d/car_racing.py new file mode 100644 index 0000000..5586413 --- /dev/null +++ b/src/gym/envs/box2d/car_racing.py 
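The Car class in car_dynamics.py above is self-contained Box2D dynamics: gas/brake/steer set targets, and step(dt) applies engine, brake and tyre-friction forces each tick. A rough sketch of stepping it outside of CarRacing, assuming Box2D is installed and using the same 50 FPS timestep as the environments in this package:

    import Box2D
    from gym.envs.box2d.car_dynamics import Car

    world = Box2D.b2World((0, 0))            # top-down view, so no gravity
    car = Car(world, init_angle=0.0, init_x=0.0, init_y=0.0)
    for _ in range(100):
        car.gas(0.5)                         # throttle target; the class ramps it up by at most 0.1 per call
        car.steer(-0.2)                      # target steering position, reached over several steps
        car.brake(0.0)
        car.step(1.0/50)                     # apply engine, brake and tyre-friction forces
        world.Step(1.0/50, 6*30, 2*30)       # then integrate the Box2D world, as CarRacing does
    print(car.hull.position, car.fuel_spent)
    car.destroy()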
@@ -0,0 +1,498 @@ +import sys, math +import numpy as np + +import Box2D +from Box2D.b2 import (edgeShape, circleShape, fixtureDef, polygonShape, revoluteJointDef, contactListener) + +import gym +from gym import spaces +from gym.envs.box2d.car_dynamics import Car +from gym.utils import colorize, seeding, EzPickle + +import pyglet +from pyglet import gl + +# Easiest continuous control task to learn from pixels, a top-down racing environment. +# Discreet control is reasonable in this environment as well, on/off discretisation is +# fine. +# +# State consists of STATE_W x STATE_H pixels. +# +# Reward is -0.1 every frame and +1000/N for every track tile visited, where N is +# the total number of tiles in track. For example, if you have finished in 732 frames, +# your reward is 1000 - 0.1*732 = 926.8 points. +# +# Game is solved when agent consistently gets 900+ points. Track is random every episode. +# +# Episode finishes when all tiles are visited. Car also can go outside of PLAYFIELD, that +# is far off the track, then it will get -100 and die. +# +# Some indicators shown at the bottom of the window and the state RGB buffer. From +# left to right: true speed, four ABS sensors, steering wheel position, gyroscope. +# +# To play yourself (it's rather fast for humans), type: +# +# python gym/envs/box2d/car_racing.py +# +# Remember it's powerful rear-wheel drive car, don't press accelerator and turn at the +# same time. +# +# Created by Oleg Klimov. Licensed on the same terms as the rest of OpenAI Gym. + +STATE_W = 96 # less than Atari 160x192 +STATE_H = 96 +VIDEO_W = 600 +VIDEO_H = 400 +WINDOW_W = 1200 +WINDOW_H = 1000 + +SCALE = 6.0 # Track scale +TRACK_RAD = 900/SCALE # Track is heavily morphed circle with this radius +PLAYFIELD = 2000/SCALE # Game over boundary +FPS = 50 +ZOOM = 2.7 # Camera zoom +ZOOM_FOLLOW = True # Set to False for fixed view (don't use zoom) + + +TRACK_DETAIL_STEP = 21/SCALE +TRACK_TURN_RATE = 0.31 +TRACK_WIDTH = 40/SCALE +BORDER = 8/SCALE +BORDER_MIN_COUNT = 4 + +ROAD_COLOR = [0.4, 0.4, 0.4] + +class FrictionDetector(contactListener): + def __init__(self, env): + contactListener.__init__(self) + self.env = env + def BeginContact(self, contact): + self._contact(contact, True) + def EndContact(self, contact): + self._contact(contact, False) + def _contact(self, contact, begin): + tile = None + obj = None + u1 = contact.fixtureA.body.userData + u2 = contact.fixtureB.body.userData + if u1 and "road_friction" in u1.__dict__: + tile = u1 + obj = u2 + if u2 and "road_friction" in u2.__dict__: + tile = u2 + obj = u1 + if not tile: return + + tile.color[0] = ROAD_COLOR[0] + tile.color[1] = ROAD_COLOR[1] + tile.color[2] = ROAD_COLOR[2] + if not obj or "tiles" not in obj.__dict__: return + if begin: + obj.tiles.add(tile) + #print tile.road_friction, "ADD", len(obj.tiles) + if not tile.road_visited: + tile.road_visited = True + self.env.reward += 1000.0/len(self.env.track) + self.env.tile_visited_count += 1 + else: + obj.tiles.remove(tile) + #print tile.road_friction, "DEL", len(obj.tiles) -- should delete to zero when on grass (this works) + +class CarRacing(gym.Env, EzPickle): + metadata = { + 'render.modes': ['human', 'rgb_array', 'state_pixels'], + 'video.frames_per_second' : FPS + } + + def __init__(self): + EzPickle.__init__(self) + self.seed() + self.contactListener_keepref = FrictionDetector(self) + self.world = Box2D.b2World((0,0), contactListener=self.contactListener_keepref) + self.viewer = None + self.invisible_state_window = None + self.invisible_video_window = None + 
self.road = None + self.car = None + self.reward = 0.0 + self.prev_reward = 0.0 + + self.action_space = spaces.Box( np.array([-1,0,0]), np.array([+1,+1,+1]), dtype=np.float32) # steer, gas, brake + self.observation_space = spaces.Box(low=0, high=255, shape=(STATE_H, STATE_W, 3), dtype=np.uint8) + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def _destroy(self): + if not self.road: return + for t in self.road: + self.world.DestroyBody(t) + self.road = [] + self.car.destroy() + + def _create_track(self): + CHECKPOINTS = 12 + + # Create checkpoints + checkpoints = [] + for c in range(CHECKPOINTS): + alpha = 2*math.pi*c/CHECKPOINTS + self.np_random.uniform(0, 2*math.pi*1/CHECKPOINTS) + rad = self.np_random.uniform(TRACK_RAD/3, TRACK_RAD) + if c==0: + alpha = 0 + rad = 1.5*TRACK_RAD + if c==CHECKPOINTS-1: + alpha = 2*math.pi*c/CHECKPOINTS + self.start_alpha = 2*math.pi*(-0.5)/CHECKPOINTS + rad = 1.5*TRACK_RAD + checkpoints.append( (alpha, rad*math.cos(alpha), rad*math.sin(alpha)) ) + + #print "\n".join(str(h) for h in checkpoints) + #self.road_poly = [ ( # uncomment this to see checkpoints + # [ (tx,ty) for a,tx,ty in checkpoints ], + # (0.7,0.7,0.9) ) ] + self.road = [] + + # Go from one checkpoint to another to create track + x, y, beta = 1.5*TRACK_RAD, 0, 0 + dest_i = 0 + laps = 0 + track = [] + no_freeze = 2500 + visited_other_side = False + while 1: + alpha = math.atan2(y, x) + if visited_other_side and alpha > 0: + laps += 1 + visited_other_side = False + if alpha < 0: + visited_other_side = True + alpha += 2*math.pi + while True: # Find destination from checkpoints + failed = True + while True: + dest_alpha, dest_x, dest_y = checkpoints[dest_i % len(checkpoints)] + if alpha <= dest_alpha: + failed = False + break + dest_i += 1 + if dest_i % len(checkpoints) == 0: break + if not failed: break + alpha -= 2*math.pi + continue + r1x = math.cos(beta) + r1y = math.sin(beta) + p1x = -r1y + p1y = r1x + dest_dx = dest_x - x # vector towards destination + dest_dy = dest_y - y + proj = r1x*dest_dx + r1y*dest_dy # destination vector projected on rad + while beta - alpha > 1.5*math.pi: beta -= 2*math.pi + while beta - alpha < -1.5*math.pi: beta += 2*math.pi + prev_beta = beta + proj *= SCALE + if proj > 0.3: beta -= min(TRACK_TURN_RATE, abs(0.001*proj)) + if proj < -0.3: beta += min(TRACK_TURN_RATE, abs(0.001*proj)) + x += p1x*TRACK_DETAIL_STEP + y += p1y*TRACK_DETAIL_STEP + track.append( (alpha,prev_beta*0.5 + beta*0.5,x,y) ) + if laps > 4: break + no_freeze -= 1 + if no_freeze==0: break + #print "\n".join([str(t) for t in enumerate(track)]) + + # Find closed loop range i1..i2, first loop should be ignored, second is OK + i1, i2 = -1, -1 + i = len(track) + while True: + i -= 1 + if i==0: return False # Failed + pass_through_start = track[i][0] > self.start_alpha and track[i-1][0] <= self.start_alpha + if pass_through_start and i2==-1: + i2 = i + elif pass_through_start and i1==-1: + i1 = i + break + print("Track generation: %i..%i -> %i-tiles track" % (i1, i2, i2-i1)) + assert i1!=-1 + assert i2!=-1 + + track = track[i1:i2-1] + + first_beta = track[0][1] + first_perp_x = math.cos(first_beta) + first_perp_y = math.sin(first_beta) + # Length of perpendicular jump to put together head and tail + well_glued_together = np.sqrt( + np.square( first_perp_x*(track[0][2] - track[-1][2]) ) + + np.square( first_perp_y*(track[0][3] - track[-1][3]) )) + if well_glued_together > TRACK_DETAIL_STEP: + return False + + # Red-white border on hard turns + border = 
[False]*len(track) + for i in range(len(track)): + good = True + oneside = 0 + for neg in range(BORDER_MIN_COUNT): + beta1 = track[i-neg-0][1] + beta2 = track[i-neg-1][1] + good &= abs(beta1 - beta2) > TRACK_TURN_RATE*0.2 + oneside += np.sign(beta1 - beta2) + good &= abs(oneside) == BORDER_MIN_COUNT + border[i] = good + for i in range(len(track)): + for neg in range(BORDER_MIN_COUNT): + border[i-neg] |= border[i] + + # Create tiles + for i in range(len(track)): + alpha1, beta1, x1, y1 = track[i] + alpha2, beta2, x2, y2 = track[i-1] + road1_l = (x1 - TRACK_WIDTH*math.cos(beta1), y1 - TRACK_WIDTH*math.sin(beta1)) + road1_r = (x1 + TRACK_WIDTH*math.cos(beta1), y1 + TRACK_WIDTH*math.sin(beta1)) + road2_l = (x2 - TRACK_WIDTH*math.cos(beta2), y2 - TRACK_WIDTH*math.sin(beta2)) + road2_r = (x2 + TRACK_WIDTH*math.cos(beta2), y2 + TRACK_WIDTH*math.sin(beta2)) + t = self.world.CreateStaticBody( fixtures = fixtureDef( + shape=polygonShape(vertices=[road1_l, road1_r, road2_r, road2_l]) + )) + t.userData = t + c = 0.01*(i%3) + t.color = [ROAD_COLOR[0] + c, ROAD_COLOR[1] + c, ROAD_COLOR[2] + c] + t.road_visited = False + t.road_friction = 1.0 + t.fixtures[0].sensor = True + self.road_poly.append(( [road1_l, road1_r, road2_r, road2_l], t.color )) + self.road.append(t) + if border[i]: + side = np.sign(beta2 - beta1) + b1_l = (x1 + side* TRACK_WIDTH *math.cos(beta1), y1 + side* TRACK_WIDTH *math.sin(beta1)) + b1_r = (x1 + side*(TRACK_WIDTH+BORDER)*math.cos(beta1), y1 + side*(TRACK_WIDTH+BORDER)*math.sin(beta1)) + b2_l = (x2 + side* TRACK_WIDTH *math.cos(beta2), y2 + side* TRACK_WIDTH *math.sin(beta2)) + b2_r = (x2 + side*(TRACK_WIDTH+BORDER)*math.cos(beta2), y2 + side*(TRACK_WIDTH+BORDER)*math.sin(beta2)) + self.road_poly.append(( [b1_l, b1_r, b2_r, b2_l], (1,1,1) if i%2==0 else (1,0,0) )) + self.track = track + return True + + def reset(self): + self._destroy() + self.reward = 0.0 + self.prev_reward = 0.0 + self.tile_visited_count = 0 + self.t = 0.0 + self.road_poly = [] + self.human_render = False + + while True: + success = self._create_track() + if success: break + print("retry to generate track (normal if there are not many of this messages)") + self.car = Car(self.world, *self.track[0][1:4]) + + return self.step(None)[0] + + def step(self, action): + if action is not None: + self.car.steer(-action[0]) + self.car.gas(action[1]) + self.car.brake(action[2]) + + self.car.step(1.0/FPS) + self.world.Step(1.0/FPS, 6*30, 2*30) + self.t += 1.0/FPS + + self.state = self.render("state_pixels") + + step_reward = 0 + done = False + if action is not None: # First step without action, called from reset() + self.reward -= 0.1 + # We actually don't want to count fuel spent, we want car to be faster. 
+ #self.reward -= 10 * self.car.fuel_spent / ENGINE_POWER + self.car.fuel_spent = 0.0 + step_reward = self.reward - self.prev_reward + self.prev_reward = self.reward + if self.tile_visited_count==len(self.track): + done = True + x, y = self.car.hull.position + if abs(x) > PLAYFIELD or abs(y) > PLAYFIELD: + done = True + step_reward = -100 + + return self.state, step_reward, done, {} + + def render(self, mode='human'): + if self.viewer is None: + from gym.envs.classic_control import rendering + self.viewer = rendering.Viewer(WINDOW_W, WINDOW_H) + self.score_label = pyglet.text.Label('0000', font_size=36, + x=20, y=WINDOW_H*2.5/40.00, anchor_x='left', anchor_y='center', + color=(255,255,255,255)) + self.transform = rendering.Transform() + + if "t" not in self.__dict__: return # reset() not called yet + + zoom = 0.1*SCALE*max(1-self.t, 0) + ZOOM*SCALE*min(self.t, 1) # Animate zoom first second + zoom_state = ZOOM*SCALE*STATE_W/WINDOW_W + zoom_video = ZOOM*SCALE*VIDEO_W/WINDOW_W + scroll_x = self.car.hull.position[0] + scroll_y = self.car.hull.position[1] + angle = -self.car.hull.angle + vel = self.car.hull.linearVelocity + if np.linalg.norm(vel) > 0.5: + angle = math.atan2(vel[0], vel[1]) + self.transform.set_scale(zoom, zoom) + self.transform.set_translation( + WINDOW_W/2 - (scroll_x*zoom*math.cos(angle) - scroll_y*zoom*math.sin(angle)), + WINDOW_H/4 - (scroll_x*zoom*math.sin(angle) + scroll_y*zoom*math.cos(angle)) ) + self.transform.set_rotation(angle) + + self.car.draw(self.viewer, mode!="state_pixels") + + arr = None + win = self.viewer.window + if mode != 'state_pixels': + win.switch_to() + win.dispatch_events() + if mode=="rgb_array" or mode=="state_pixels": + win.clear() + t = self.transform + if mode=='rgb_array': + VP_W = VIDEO_W + VP_H = VIDEO_H + else: + VP_W = STATE_W + VP_H = STATE_H + gl.glViewport(0, 0, VP_W, VP_H) + t.enable() + self.render_road() + for geom in self.viewer.onetime_geoms: + geom.render() + t.disable() + self.render_indicators(WINDOW_W, WINDOW_H) # TODO: find why 2x needed, wtf + image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data() + arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') + arr = arr.reshape(VP_H, VP_W, 4) + arr = arr[::-1, :, 0:3] + + if mode=="rgb_array" and not self.human_render: # agent can call or not call env.render() itself when recording video. 
+ win.flip() + + if mode=='human': + self.human_render = True + win.clear() + t = self.transform + gl.glViewport(0, 0, WINDOW_W, WINDOW_H) + t.enable() + self.render_road() + for geom in self.viewer.onetime_geoms: + geom.render() + t.disable() + self.render_indicators(WINDOW_W, WINDOW_H) + win.flip() + + self.viewer.onetime_geoms = [] + return arr + + def close(self): + if self.viewer is not None: + self.viewer.close() + self.viewer = None + + def render_road(self): + gl.glBegin(gl.GL_QUADS) + gl.glColor4f(0.4, 0.8, 0.4, 1.0) + gl.glVertex3f(-PLAYFIELD, +PLAYFIELD, 0) + gl.glVertex3f(+PLAYFIELD, +PLAYFIELD, 0) + gl.glVertex3f(+PLAYFIELD, -PLAYFIELD, 0) + gl.glVertex3f(-PLAYFIELD, -PLAYFIELD, 0) + gl.glColor4f(0.4, 0.9, 0.4, 1.0) + k = PLAYFIELD/20.0 + for x in range(-20, 20, 2): + for y in range(-20, 20, 2): + gl.glVertex3f(k*x + k, k*y + 0, 0) + gl.glVertex3f(k*x + 0, k*y + 0, 0) + gl.glVertex3f(k*x + 0, k*y + k, 0) + gl.glVertex3f(k*x + k, k*y + k, 0) + for poly, color in self.road_poly: + gl.glColor4f(color[0], color[1], color[2], 1) + for p in poly: + gl.glVertex3f(p[0], p[1], 0) + gl.glEnd() + + def render_indicators(self, W, H): + gl.glBegin(gl.GL_QUADS) + s = W/40.0 + h = H/40.0 + gl.glColor4f(0,0,0,1) + gl.glVertex3f(W, 0, 0) + gl.glVertex3f(W, 5*h, 0) + gl.glVertex3f(0, 5*h, 0) + gl.glVertex3f(0, 0, 0) + def vertical_ind(place, val, color): + gl.glColor4f(color[0], color[1], color[2], 1) + gl.glVertex3f((place+0)*s, h + h*val, 0) + gl.glVertex3f((place+1)*s, h + h*val, 0) + gl.glVertex3f((place+1)*s, h, 0) + gl.glVertex3f((place+0)*s, h, 0) + def horiz_ind(place, val, color): + gl.glColor4f(color[0], color[1], color[2], 1) + gl.glVertex3f((place+0)*s, 4*h , 0) + gl.glVertex3f((place+val)*s, 4*h, 0) + gl.glVertex3f((place+val)*s, 2*h, 0) + gl.glVertex3f((place+0)*s, 2*h, 0) + true_speed = np.sqrt(np.square(self.car.hull.linearVelocity[0]) + np.square(self.car.hull.linearVelocity[1])) + vertical_ind(5, 0.02*true_speed, (1,1,1)) + vertical_ind(7, 0.01*self.car.wheels[0].omega, (0.0,0,1)) # ABS sensors + vertical_ind(8, 0.01*self.car.wheels[1].omega, (0.0,0,1)) + vertical_ind(9, 0.01*self.car.wheels[2].omega, (0.2,0,1)) + vertical_ind(10,0.01*self.car.wheels[3].omega, (0.2,0,1)) + horiz_ind(20, -10.0*self.car.wheels[0].joint.angle, (0,1,0)) + horiz_ind(30, -0.8*self.car.hull.angularVelocity, (1,0,0)) + gl.glEnd() + self.score_label.text = "%04i" % self.reward + self.score_label.draw() + + +if __name__=="__main__": + from pyglet.window import key + a = np.array( [0.0, 0.0, 0.0] ) + def key_press(k, mod): + global restart + if k==0xff0d: restart = True + if k==key.LEFT: a[0] = -1.0 + if k==key.RIGHT: a[0] = +1.0 + if k==key.UP: a[1] = +1.0 + if k==key.DOWN: a[2] = +0.8 # set 1.0 for wheels to block to zero rotation + def key_release(k, mod): + if k==key.LEFT and a[0]==-1.0: a[0] = 0 + if k==key.RIGHT and a[0]==+1.0: a[0] = 0 + if k==key.UP: a[1] = 0 + if k==key.DOWN: a[2] = 0 + env = CarRacing() + env.render() + record_video = False + if record_video: + env.monitor.start('/tmp/video-test', force=True) + env.viewer.window.on_key_press = key_press + env.viewer.window.on_key_release = key_release + while True: + env.reset() + total_reward = 0.0 + steps = 0 + restart = False + while True: + s, r, done, info = env.step(a) + total_reward += r + if steps % 200 == 0 or done: + print("\naction " + str(["{:+0.2f}".format(x) for x in a])) + print("step {} total_reward {:+0.2f}".format(steps, total_reward)) + #import matplotlib.pyplot as plt + #plt.imshow(s) + #plt.savefig("test.jpeg") + steps += 1 
+ if not record_video: # Faster, but you can as well call env.render() every time to play full window. + env.render() + if done or restart: break + env.close() diff --git a/src/gym/envs/box2d/lunar_lander.py b/src/gym/envs/box2d/lunar_lander.py new file mode 100644 index 0000000..22d6c6b --- /dev/null +++ b/src/gym/envs/box2d/lunar_lander.py @@ -0,0 +1,420 @@ +import sys, math +import numpy as np + +import Box2D +from Box2D.b2 import (edgeShape, circleShape, fixtureDef, polygonShape, revoluteJointDef, contactListener) + +import gym +from gym import spaces +from gym.utils import seeding, EzPickle + +# Rocket trajectory optimization is a classic topic in Optimal Control. +# +# According to Pontryagin's maximum principle it's optimal to fire engine full throttle or +# turn it off. That's the reason this environment is OK to have discreet actions (engine on or off). +# +# Landing pad is always at coordinates (0,0). Coordinates are the first two numbers in state vector. +# Reward for moving from the top of the screen to landing pad and zero speed is about 100..140 points. +# If lander moves away from landing pad it loses reward back. Episode finishes if the lander crashes or +# comes to rest, receiving additional -100 or +100 points. Each leg ground contact is +10. Firing main +# engine is -0.3 points each frame. Solved is 200 points. +# +# Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land +# on its first attempt. Please see source code for details. +# +# Too see heuristic landing, run: +# +# python gym/envs/box2d/lunar_lander.py +# +# To play yourself, run: +# +# python examples/agents/keyboard_agent.py LunarLander-v2 +# +# Created by Oleg Klimov. Licensed on the same terms as the rest of OpenAI Gym. + +FPS = 50 +SCALE = 30.0 # affects how fast-paced the game is, forces should be adjusted as well + +MAIN_ENGINE_POWER = 13.0 +SIDE_ENGINE_POWER = 0.6 + +INITIAL_RANDOM = 1000.0 # Set 1500 to make game harder + +LANDER_POLY =[ + (-14,+17), (-17,0), (-17,-10), + (+17,-10), (+17,0), (+14,+17) + ] +LEG_AWAY = 20 +LEG_DOWN = 18 +LEG_W, LEG_H = 2, 8 +LEG_SPRING_TORQUE = 40 + +SIDE_ENGINE_HEIGHT = 14.0 +SIDE_ENGINE_AWAY = 12.0 + +VIEWPORT_W = 600 +VIEWPORT_H = 400 + +class ContactDetector(contactListener): + def __init__(self, env): + contactListener.__init__(self) + self.env = env + def BeginContact(self, contact): + if self.env.lander==contact.fixtureA.body or self.env.lander==contact.fixtureB.body: + self.env.game_over = True + for i in range(2): + if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]: + self.env.legs[i].ground_contact = True + def EndContact(self, contact): + for i in range(2): + if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]: + self.env.legs[i].ground_contact = False + +class LunarLander(gym.Env, EzPickle): + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second' : FPS + } + + continuous = False + + def __init__(self): + EzPickle.__init__(self) + self.seed() + self.viewer = None + + self.world = Box2D.b2World() + self.moon = None + self.lander = None + self.particles = [] + + self.prev_reward = None + + # useful range is -1 .. +1, but spikes can be higher + self.observation_space = spaces.Box(-np.inf, np.inf, shape=(8,), dtype=np.float32) + + if self.continuous: + # Action is two floats [main engine, left-right engines]. + # Main engine: -1..0 off, 0..+1 throttle from 50% to 100% power. Engine can't work with less than 50% power. 
+ # Left-right: -1.0..-0.5 fire left engine, +0.5..+1.0 fire right engine, -0.5..0.5 off + self.action_space = spaces.Box(-1, +1, (2,), dtype=np.float32) + else: + # Nop, fire left engine, main engine, right engine + self.action_space = spaces.Discrete(4) + + self.reset() + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def _destroy(self): + if not self.moon: return + self.world.contactListener = None + self._clean_particles(True) + self.world.DestroyBody(self.moon) + self.moon = None + self.world.DestroyBody(self.lander) + self.lander = None + self.world.DestroyBody(self.legs[0]) + self.world.DestroyBody(self.legs[1]) + + def reset(self): + self._destroy() + self.world.contactListener_keepref = ContactDetector(self) + self.world.contactListener = self.world.contactListener_keepref + self.game_over = False + self.prev_shaping = None + + W = VIEWPORT_W/SCALE + H = VIEWPORT_H/SCALE + + # terrain + CHUNKS = 11 + height = self.np_random.uniform(0, H/2, size=(CHUNKS+1,) ) + chunk_x = [W/(CHUNKS-1)*i for i in range(CHUNKS)] + self.helipad_x1 = chunk_x[CHUNKS//2-1] + self.helipad_x2 = chunk_x[CHUNKS//2+1] + self.helipad_y = H/4 + height[CHUNKS//2-2] = self.helipad_y + height[CHUNKS//2-1] = self.helipad_y + height[CHUNKS//2+0] = self.helipad_y + height[CHUNKS//2+1] = self.helipad_y + height[CHUNKS//2+2] = self.helipad_y + smooth_y = [0.33*(height[i-1] + height[i+0] + height[i+1]) for i in range(CHUNKS)] + + self.moon = self.world.CreateStaticBody( shapes=edgeShape(vertices=[(0, 0), (W, 0)]) ) + self.sky_polys = [] + for i in range(CHUNKS-1): + p1 = (chunk_x[i], smooth_y[i]) + p2 = (chunk_x[i+1], smooth_y[i+1]) + self.moon.CreateEdgeFixture( + vertices=[p1,p2], + density=0, + friction=0.1) + self.sky_polys.append( [p1, p2, (p2[0],H), (p1[0],H)] ) + + self.moon.color1 = (0.0,0.0,0.0) + self.moon.color2 = (0.0,0.0,0.0) + + initial_y = VIEWPORT_H/SCALE + self.lander = self.world.CreateDynamicBody( + position = (VIEWPORT_W/SCALE/2, initial_y), + angle=0.0, + fixtures = fixtureDef( + shape=polygonShape(vertices=[ (x/SCALE,y/SCALE) for x,y in LANDER_POLY ]), + density=5.0, + friction=0.1, + categoryBits=0x0010, + maskBits=0x001, # collide only with ground + restitution=0.0) # 0.99 bouncy + ) + self.lander.color1 = (0.5,0.4,0.9) + self.lander.color2 = (0.3,0.3,0.5) + self.lander.ApplyForceToCenter( ( + self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM), + self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM) + ), True) + + self.legs = [] + for i in [-1,+1]: + leg = self.world.CreateDynamicBody( + position = (VIEWPORT_W/SCALE/2 - i*LEG_AWAY/SCALE, initial_y), + angle = (i*0.05), + fixtures = fixtureDef( + shape=polygonShape(box=(LEG_W/SCALE, LEG_H/SCALE)), + density=1.0, + restitution=0.0, + categoryBits=0x0020, + maskBits=0x001) + ) + leg.ground_contact = False + leg.color1 = (0.5,0.4,0.9) + leg.color2 = (0.3,0.3,0.5) + rjd = revoluteJointDef( + bodyA=self.lander, + bodyB=leg, + localAnchorA=(0, 0), + localAnchorB=(i*LEG_AWAY/SCALE, LEG_DOWN/SCALE), + enableMotor=True, + enableLimit=True, + maxMotorTorque=LEG_SPRING_TORQUE, + motorSpeed=+0.3*i # low enough not to jump back into the sky + ) + if i==-1: + rjd.lowerAngle = +0.9 - 0.5 # Yes, the most esoteric numbers here, angles legs have freedom to travel within + rjd.upperAngle = +0.9 + else: + rjd.lowerAngle = -0.9 + rjd.upperAngle = -0.9 + 0.5 + leg.joint = self.world.CreateJoint(rjd) + self.legs.append(leg) + + self.drawlist = [self.lander] + self.legs + + return self.step(np.array([0,0]) if 
self.continuous else 0)[0] + + def _create_particle(self, mass, x, y, ttl): + p = self.world.CreateDynamicBody( + position = (x,y), + angle=0.0, + fixtures = fixtureDef( + shape=circleShape(radius=2/SCALE, pos=(0,0)), + density=mass, + friction=0.1, + categoryBits=0x0100, + maskBits=0x001, # collide only with ground + restitution=0.3) + ) + p.ttl = ttl + self.particles.append(p) + self._clean_particles(False) + return p + + def _clean_particles(self, all): + while self.particles and (all or self.particles[0].ttl<0): + self.world.DestroyBody(self.particles.pop(0)) + + def step(self, action): + if self.continuous: + action = np.clip(action, -1, +1).astype(np.float32) + else: + assert self.action_space.contains(action), "%r (%s) invalid " % (action, type(action)) + + # Engines + tip = (math.sin(self.lander.angle), math.cos(self.lander.angle)) + side = (-tip[1], tip[0]); + dispersion = [self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2)] + + m_power = 0.0 + if (self.continuous and action[0] > 0.0) or (not self.continuous and action==2): + # Main engine + if self.continuous: + m_power = (np.clip(action[0], 0.0,1.0) + 1.0)*0.5 # 0.5..1.0 + assert m_power>=0.5 and m_power <= 1.0 + else: + m_power = 1.0 + ox = tip[0]*(4/SCALE + 2*dispersion[0]) + side[0]*dispersion[1] # 4 is move a bit downwards, +-2 for randomness + oy = -tip[1]*(4/SCALE + 2*dispersion[0]) - side[1]*dispersion[1] + impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy) + p = self._create_particle(3.5, impulse_pos[0], impulse_pos[1], m_power) # particles are just a decoration, 3.5 is here to make particle speed adequate + p.ApplyLinearImpulse( ( ox*MAIN_ENGINE_POWER*m_power, oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True) + self.lander.ApplyLinearImpulse( (-ox*MAIN_ENGINE_POWER*m_power, -oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True) + + s_power = 0.0 + if (self.continuous and np.abs(action[1]) > 0.5) or (not self.continuous and action in [1,3]): + # Orientation engines + if self.continuous: + direction = np.sign(action[1]) + s_power = np.clip(np.abs(action[1]), 0.5,1.0) + assert s_power>=0.5 and s_power <= 1.0 + else: + direction = action-2 + s_power = 1.0 + ox = tip[0]*dispersion[0] + side[0]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) + oy = -tip[1]*dispersion[0] - side[1]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) + impulse_pos = (self.lander.position[0] + ox - tip[0]*17/SCALE, self.lander.position[1] + oy + tip[1]*SIDE_ENGINE_HEIGHT/SCALE) + p = self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power) + p.ApplyLinearImpulse( ( ox*SIDE_ENGINE_POWER*s_power, oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True) + self.lander.ApplyLinearImpulse( (-ox*SIDE_ENGINE_POWER*s_power, -oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True) + + self.world.Step(1.0/FPS, 6*30, 2*30) + + pos = self.lander.position + vel = self.lander.linearVelocity + state = [ + (pos.x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2), + (pos.y - (self.helipad_y+LEG_DOWN/SCALE)) / (VIEWPORT_H/SCALE/2), + vel.x*(VIEWPORT_W/SCALE/2)/FPS, + vel.y*(VIEWPORT_H/SCALE/2)/FPS, + self.lander.angle, + 20.0*self.lander.angularVelocity/FPS, + 1.0 if self.legs[0].ground_contact else 0.0, + 1.0 if self.legs[1].ground_contact else 0.0 + ] + assert len(state)==8 + + reward = 0 + shaping = \ + - 100*np.sqrt(state[0]*state[0] + state[1]*state[1]) \ + - 100*np.sqrt(state[2]*state[2] + state[3]*state[3]) \ + - 100*abs(state[4]) + 10*state[6] + 10*state[7] # And ten points for legs contact, the idea is if you + # lose contact again 
after landing, you get negative reward + if self.prev_shaping is not None: + reward = shaping - self.prev_shaping + self.prev_shaping = shaping + + reward -= m_power*0.30 # less fuel spent is better, about -30 for heurisic landing + reward -= s_power*0.03 + + done = False + if self.game_over or abs(state[0]) >= 1.0: + done = True + reward = -100 + if not self.lander.awake: + done = True + reward = +100 + return np.array(state, dtype=np.float32), reward, done, {} + + def render(self, mode='human'): + from gym.envs.classic_control import rendering + if self.viewer is None: + self.viewer = rendering.Viewer(VIEWPORT_W, VIEWPORT_H) + self.viewer.set_bounds(0, VIEWPORT_W/SCALE, 0, VIEWPORT_H/SCALE) + + for obj in self.particles: + obj.ttl -= 0.15 + obj.color1 = (max(0.2,0.2+obj.ttl), max(0.2,0.5*obj.ttl), max(0.2,0.5*obj.ttl)) + obj.color2 = (max(0.2,0.2+obj.ttl), max(0.2,0.5*obj.ttl), max(0.2,0.5*obj.ttl)) + + self._clean_particles(False) + + for p in self.sky_polys: + self.viewer.draw_polygon(p, color=(0,0,0)) + + for obj in self.particles + self.drawlist: + for f in obj.fixtures: + trans = f.body.transform + if type(f.shape) is circleShape: + t = rendering.Transform(translation=trans*f.shape.pos) + self.viewer.draw_circle(f.shape.radius, 20, color=obj.color1).add_attr(t) + self.viewer.draw_circle(f.shape.radius, 20, color=obj.color2, filled=False, linewidth=2).add_attr(t) + else: + path = [trans*v for v in f.shape.vertices] + self.viewer.draw_polygon(path, color=obj.color1) + path.append(path[0]) + self.viewer.draw_polyline(path, color=obj.color2, linewidth=2) + + for x in [self.helipad_x1, self.helipad_x2]: + flagy1 = self.helipad_y + flagy2 = flagy1 + 50/SCALE + self.viewer.draw_polyline( [(x, flagy1), (x, flagy2)], color=(1,1,1) ) + self.viewer.draw_polygon( [(x, flagy2), (x, flagy2-10/SCALE), (x+25/SCALE, flagy2-5/SCALE)], color=(0.8,0.8,0) ) + + return self.viewer.render(return_rgb_array = mode=='rgb_array') + + def close(self): + if self.viewer is not None: + self.viewer.close() + self.viewer = None + +class LunarLanderContinuous(LunarLander): + continuous = True + +def heuristic(env, s): + # Heuristic for: + # 1. Testing. + # 2. Demonstration rollout. 
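+ #
+ # Sketch of the controller below: angle_targ points the lander toward the pad using the
+ # horizontal position s[0] and horizontal speed s[2], and hover_targ sets a descent
+ # height proportional to |s[0]|. Two PD-style corrections (angle_todo from s[4]/s[5],
+ # hover_todo from s[1]/s[3]) are then mapped to either the continuous 2-float action
+ # or one of the four discrete actions.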
+ angle_targ = s[0]*0.5 + s[2]*1.0 # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed) + if angle_targ > 0.4: angle_targ = 0.4 # more than 0.4 radians (22 degrees) is bad + if angle_targ < -0.4: angle_targ = -0.4 + hover_targ = 0.55*np.abs(s[0]) # target y should be proporional to horizontal offset + + # PID controller: s[4] angle, s[5] angularSpeed + angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0 + #print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo)) + + # PID controller: s[1] vertical coordinate s[3] vertical speed + hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5 + #print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo)) + + if s[6] or s[7]: # legs have contact + angle_todo = 0 + hover_todo = -(s[3])*0.5 # override to reduce fall speed, that's all we need after contact + + if env.continuous: + a = np.array( [hover_todo*20 - 1, -angle_todo*20] ) + a = np.clip(a, -1, +1) + else: + a = 0 + if hover_todo > np.abs(angle_todo) and hover_todo > 0.05: a = 2 + elif angle_todo < -0.05: a = 3 + elif angle_todo > +0.05: a = 1 + return a + +def demo_heuristic_lander(env, seed=None, render=False): + env.seed(seed) + total_reward = 0 + steps = 0 + s = env.reset() + while True: + a = heuristic(env, s) + s, r, done, info = env.step(a) + total_reward += r + + if render: + still_open = env.render() + if still_open == False: break + + if steps % 20 == 0 or done: + print("observations:", " ".join(["{:+0.2f}".format(x) for x in s])) + print("step {} total_reward {:+0.2f}".format(steps, total_reward)) + steps += 1 + if done: break + return total_reward + + +if __name__ == '__main__': + demo_heuristic_lander(LunarLander(), render=True) + + diff --git a/src/gym/envs/box2d/test_lunar_lander.py b/src/gym/envs/box2d/test_lunar_lander.py new file mode 100644 index 0000000..bdd81ce --- /dev/null +++ b/src/gym/envs/box2d/test_lunar_lander.py @@ -0,0 +1,13 @@ +from .lunar_lander import LunarLander, LunarLanderContinuous, demo_heuristic_lander + +def test_lunar_lander(): + _test_lander(LunarLander(), seed=0) + +def test_lunar_lander_continuous(): + _test_lander(LunarLanderContinuous(), seed=0) + +def _test_lander(env, seed=None, render=False): + total_reward = demo_heuristic_lander(env, seed=seed, render=render) + assert total_reward > 100 + + diff --git a/src/gym/envs/classic_control/__init__.py b/src/gym/envs/classic_control/__init__.py new file mode 100644 index 0000000..53b3ff3 --- /dev/null +++ b/src/gym/envs/classic_control/__init__.py @@ -0,0 +1,6 @@ +from gym.envs.classic_control.cartpole import CartPoleEnv +from gym.envs.classic_control.mountain_car import MountainCarEnv +from gym.envs.classic_control.continuous_mountain_car import Continuous_MountainCarEnv +from gym.envs.classic_control.pendulum import PendulumEnv +from gym.envs.classic_control.acrobot import AcrobotEnv + diff --git a/src/gym/envs/classic_control/acrobot.py b/src/gym/envs/classic_control/acrobot.py new file mode 100644 index 0000000..9a3ef4d --- /dev/null +++ b/src/gym/envs/classic_control/acrobot.py @@ -0,0 +1,304 @@ +"""classic Acrobot task""" +from gym import core, spaces +from gym.utils import seeding +import numpy as np +from numpy import sin, cos, pi + +__copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy" +__credits__ = ["Alborz Geramifard", "Robert H. Klein", "Christoph Dann", + "William Dabney", "Jonathan P. 
How"]
+__license__ = "BSD 3-Clause"
+__author__ = "Christoph Dann "
+
+# SOURCE:
+# https://github.com/rlpy/rlpy/blob/master/rlpy/Domains/Acrobot.py
+
+class AcrobotEnv(core.Env):
+
+ """
+ Acrobot is a 2-link pendulum with only the second joint actuated.
+ Initially, both links point downwards. The goal is to swing the
+ end-effector at a height at least the length of one link above the base.
+ Both links can swing freely and can pass by each other, i.e., they don't
+ collide when they have the same angle.
+ **STATE:**
+ The state consists of the sin() and cos() of the two rotational joint
+ angles and the joint angular velocities:
+ [cos(theta1) sin(theta1) cos(theta2) sin(theta2) thetaDot1 thetaDot2].
+ For the first link, an angle of 0 corresponds to the link pointing downwards.
+ The angle of the second link is relative to the angle of the first link.
+ An angle of 0 corresponds to having the same angle between the two links.
+ A state of [1, 0, 1, 0, ..., ...] means that both links point downwards.
+ **ACTIONS:**
+ The action is either applying +1, 0 or -1 torque on the joint between
+ the two pendulum links.
+ .. note::
+ The dynamics equations were missing some terms in the NIPS paper which
+ are present in the book. R. Sutton confirmed in personal correspondence
+ that the experimental results shown in the paper and the book were
+ generated with the equations shown in the book.
+ However, there is the option to run the domain with the paper equations
+ by setting book_or_nips = 'nips'
+ **REFERENCE:**
+ .. seealso::
+ R. Sutton: Generalization in Reinforcement Learning:
+ Successful Examples Using Sparse Coarse Coding (NIPS 1996)
+ .. seealso::
+ R. Sutton and A. G. Barto:
+ Reinforcement learning: An introduction.
+ Cambridge: MIT press, 1998.
+ .. warning::
+ This version of the domain uses the Runge-Kutta method for integrating
+ the system dynamics and is more realistic, but also considerably harder
+ than the original version which employs Euler integration,
+ see the AcrobotLegacy class.
+ """
+
+ metadata = {
+ 'render.modes': ['human', 'rgb_array'],
+ 'video.frames_per_second' : 15
+ }
+
+ dt = .2
+
+ LINK_LENGTH_1 = 1. # [m]
+ LINK_LENGTH_2 = 1. # [m]
+ LINK_MASS_1 = 1. #: [kg] mass of link 1
+ LINK_MASS_2 = 1. #: [kg] mass of link 2
+ LINK_COM_POS_1 = 0.5 #: [m] position of the center of mass of link 1
+ LINK_COM_POS_2 = 0.5 #: [m] position of the center of mass of link 2
+ LINK_MOI = 1. #: moments of inertia for both links
+
+ MAX_VEL_1 = 4 * np.pi
+ MAX_VEL_2 = 9 * np.pi
+
+ AVAIL_TORQUE = [-1., 0., +1]
+
+ torque_noise_max = 0.
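+ #: AVAIL_TORQUE maps the discrete actions 0, 1, 2 to torques of -1, 0, +1 on the joint
+ #: between the links; setting torque_noise_max > 0 adds uniform noise of that magnitude
+ #: to the applied torque in step()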
+ + #: use dynamics equations from the nips paper or the book + book_or_nips = "book" + action_arrow = None + domain_fig = None + actions_num = 3 + + def __init__(self): + self.viewer = None + high = np.array([1.0, 1.0, 1.0, 1.0, self.MAX_VEL_1, self.MAX_VEL_2]) + low = -high + self.observation_space = spaces.Box(low=low, high=high) + self.action_space = spaces.Discrete(3) + self.state = None + self.seed() + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def reset(self): + self.state = self.np_random.uniform(low=-0.1, high=0.1, size=(4,)) + return self._get_ob() + + def step(self, a): + s = self.state + torque = self.AVAIL_TORQUE[a] + + # Add noise to the force action + if self.torque_noise_max > 0: + torque += self.np_random.uniform(-self.torque_noise_max, self.torque_noise_max) + + # Now, augment the state with our force action so it can be passed to + # _dsdt + s_augmented = np.append(s, torque) + + ns = rk4(self._dsdt, s_augmented, [0, self.dt]) + # only care about final timestep of integration returned by integrator + ns = ns[-1] + ns = ns[:4] # omit action + # ODEINT IS TOO SLOW! + # ns_continuous = integrate.odeint(self._dsdt, self.s_continuous, [0, self.dt]) + # self.s_continuous = ns_continuous[-1] # We only care about the state + # at the ''final timestep'', self.dt + + ns[0] = wrap(ns[0], -pi, pi) + ns[1] = wrap(ns[1], -pi, pi) + ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1) + ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2) + self.state = ns + terminal = self._terminal() + reward = -1. if not terminal else 0. + return (self._get_ob(), reward, terminal, {}) + + def _get_ob(self): + s = self.state + return np.array([cos(s[0]), np.sin(s[0]), cos(s[1]), sin(s[1]), s[2], s[3]]) + + def _terminal(self): + s = self.state + return bool(-np.cos(s[0]) - np.cos(s[1] + s[0]) > 1.) + + def _dsdt(self, s_augmented, t): + m1 = self.LINK_MASS_1 + m2 = self.LINK_MASS_2 + l1 = self.LINK_LENGTH_1 + lc1 = self.LINK_COM_POS_1 + lc2 = self.LINK_COM_POS_2 + I1 = self.LINK_MOI + I2 = self.LINK_MOI + g = 9.8 + a = s_augmented[-1] + s = s_augmented[:-1] + theta1 = s[0] + theta2 = s[1] + dtheta1 = s[2] + dtheta2 = s[3] + d1 = m1 * lc1 ** 2 + m2 * \ + (l1 ** 2 + lc2 ** 2 + 2 * l1 * lc2 * np.cos(theta2)) + I1 + I2 + d2 = m2 * (lc2 ** 2 + l1 * lc2 * np.cos(theta2)) + I2 + phi2 = m2 * lc2 * g * np.cos(theta1 + theta2 - np.pi / 2.) + phi1 = - m2 * l1 * lc2 * dtheta2 ** 2 * np.sin(theta2) \ + - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * np.sin(theta2) \ + + (m1 * lc1 + m2 * l1) * g * np.cos(theta1 - np.pi / 2) + phi2 + if self.book_or_nips == "nips": + # the following line is consistent with the description in the + # paper + ddtheta2 = (a + d2 / d1 * phi1 - phi2) / \ + (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1) + else: + # the following line is consistent with the java implementation and the + # book + ddtheta2 = (a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1 ** 2 * np.sin(theta2) - phi2) \ + / (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1) + ddtheta1 = -(d2 * ddtheta2 + phi1) / d1 + return (dtheta1, dtheta2, ddtheta1, ddtheta2, 0.) 
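+ # Note on _dsdt above: the trailing 0. keeps the appended torque constant while rk4()
+ # integrates the augmented state [theta1, theta2, dtheta1, dtheta2, torque]; step()
+ # then drops that last component again with ns[:4].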
+ + def render(self, mode='human'): + from gym.envs.classic_control import rendering + + s = self.state + + if self.viewer is None: + self.viewer = rendering.Viewer(500,500) + bound = self.LINK_LENGTH_1 + self.LINK_LENGTH_2 + 0.2 # 2.2 for default + self.viewer.set_bounds(-bound,bound,-bound,bound) + + if s is None: return None + + p1 = [-self.LINK_LENGTH_1 * + np.cos(s[0]), self.LINK_LENGTH_1 * np.sin(s[0])] + + p2 = [p1[0] - self.LINK_LENGTH_2 * np.cos(s[0] + s[1]), + p1[1] + self.LINK_LENGTH_2 * np.sin(s[0] + s[1])] + + xys = np.array([[0,0], p1, p2])[:,::-1] + thetas = [s[0]-np.pi/2, s[0]+s[1]-np.pi/2] + link_lengths = [self.LINK_LENGTH_1, self.LINK_LENGTH_2] + + self.viewer.draw_line((-2.2, 1), (2.2, 1)) + for ((x,y),th,llen) in zip(xys, thetas, link_lengths): + l,r,t,b = 0, llen, .1, -.1 + jtransform = rendering.Transform(rotation=th, translation=(x,y)) + link = self.viewer.draw_polygon([(l,b), (l,t), (r,t), (r,b)]) + link.add_attr(jtransform) + link.set_color(0,.8, .8) + circ = self.viewer.draw_circle(.1) + circ.set_color(.8, .8, 0) + circ.add_attr(jtransform) + + return self.viewer.render(return_rgb_array = mode=='rgb_array') + + def close(self): + if self.viewer: + self.viewer.close() + self.viewer = None + +def wrap(x, m, M): + """ + :param x: a scalar + :param m: minimum possible value in range + :param M: maximum possible value in range + Wraps ``x`` so m <= x <= M; but unlike ``bound()`` which + truncates, ``wrap()`` wraps x around the coordinate system defined by m,M.\n + For example, m = -180, M = 180 (degrees), x = 360 --> returns 0. + """ + diff = M - m + while x > M: + x = x - diff + while x < m: + x = x + diff + return x + +def bound(x, m, M=None): + """ + :param x: scalar + Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR* + have m as length 2 vector, bound(x,m, ) returns m[0] <= x <= m[1]. + """ + if M is None: + M = m[1] + m = m[0] + # bound x between min (m) and Max (M) + return min(max(x, m), M) + + +def rk4(derivs, y0, t, *args, **kwargs): + """ + Integrate 1D or ND system of ODEs using 4-th order Runge-Kutta. + This is a toy implementation which may be useful if you find + yourself stranded on a system w/o scipy. Otherwise use + :func:`scipy.integrate`. + *y0* + initial state vector + *t* + sample times + *derivs* + returns the derivative of the system and has the + signature ``dy = derivs(yi, ti)`` + *args* + additional arguments passed to the derivative function + *kwargs* + additional keyword arguments passed to the derivative function + Example 1 :: + ## 2D system + def derivs6(x,t): + d1 = x[0] + 2*x[1] + d2 = -3*x[0] + 4*x[1] + return (d1, d2) + dt = 0.0005 + t = arange(0.0, 2.0, dt) + y0 = (1,2) + yout = rk4(derivs6, y0, t) + Example 2:: + ## 1D system + alpha = 2 + def derivs(x,t): + return -alpha*x + exp(-t) + y0 = 1 + yout = rk4(derivs, y0, t) + If you have access to scipy, you should probably be using the + scipy.integrate tools rather than this function. 
+ """ + + try: + Ny = len(y0) + except TypeError: + yout = np.zeros((len(t),), np.float_) + else: + yout = np.zeros((len(t), Ny), np.float_) + + yout[0] = y0 + + + for i in np.arange(len(t) - 1): + + thist = t[i] + dt = t[i + 1] - thist + dt2 = dt / 2.0 + y0 = yout[i] + + k1 = np.asarray(derivs(y0, thist, *args, **kwargs)) + k2 = np.asarray(derivs(y0 + dt2 * k1, thist + dt2, *args, **kwargs)) + k3 = np.asarray(derivs(y0 + dt2 * k2, thist + dt2, *args, **kwargs)) + k4 = np.asarray(derivs(y0 + dt * k3, thist + dt, *args, **kwargs)) + yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4) + return yout diff --git a/src/gym/envs/classic_control/assets/clockwise.png b/src/gym/envs/classic_control/assets/clockwise.png new file mode 100644 index 0000000..1aa4236 Binary files /dev/null and b/src/gym/envs/classic_control/assets/clockwise.png differ diff --git a/src/gym/envs/classic_control/cartpole.py b/src/gym/envs/classic_control/cartpole.py new file mode 100644 index 0000000..ec0f4fb --- /dev/null +++ b/src/gym/envs/classic_control/cartpole.py @@ -0,0 +1,193 @@ +""" +Classic cart-pole system implemented by Rich Sutton et al. +Copied from http://incompleteideas.net/sutton/book/code/pole.c +permalink: https://perma.cc/C9ZM-652R +""" + +import math +import gym +from gym import spaces, logger +from gym.utils import seeding +import numpy as np + +class CartPoleEnv(gym.Env): + """ + Description: + A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The pendulum starts upright, and the goal is to prevent it from falling over by increasing and reducing the cart's velocity. + + Source: + This environment corresponds to the version of the cart-pole problem described by Barto, Sutton, and Anderson + + Observation: + Type: Box(4) + Num Observation Min Max + 0 Cart Position -4.8 4.8 + 1 Cart Velocity -Inf Inf + 2 Pole Angle -24° 24° + 3 Pole Velocity At Tip -Inf Inf + + Actions: + Type: Discrete(2) + Num Action + 0 Push cart to the left + 1 Push cart to the right + + Note: The amount the velocity is reduced or increased is not fixed as it depends on the angle the pole is pointing. This is because the center of gravity of the pole increases the amount of energy needed to move the cart underneath it + + Reward: + Reward is 1 for every step taken, including the termination step + + Starting State: + All observations are assigned a uniform random value between ±0.05 + + Episode Termination: + Pole Angle is more than ±12° + Cart Position is more than ±2.4 (center of the cart reaches the edge of the display) + Episode length is greater than 200 + Solved Requirements + Considered solved when the average reward is greater than or equal to 195.0 over 100 consecutive trials. 
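+        Example:
+            A minimal interaction loop (a sketch; the random policy is for illustration only):
+
+                from gym.envs.classic_control.cartpole import CartPoleEnv
+                env = CartPoleEnv()
+                obs = env.reset()
+                done = False
+                while not done:
+                    obs, reward, done, info = env.step(env.action_space.sample())
+                env.close()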
+ """ + + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second' : 50 + } + + def __init__(self): + self.gravity = 9.8 + self.masscart = 1.0 + self.masspole = 0.1 + self.total_mass = (self.masspole + self.masscart) + self.length = 0.5 # actually half the pole's length + self.polemass_length = (self.masspole * self.length) + self.force_mag = 10.0 + self.tau = 0.02 # seconds between state updates + self.kinematics_integrator = 'euler' + + # Angle at which to fail the episode + self.theta_threshold_radians = 12 * 2 * math.pi / 360 + self.x_threshold = 2.4 + + # Angle limit set to 2 * theta_threshold_radians so failing observation is still within bounds + high = np.array([ + self.x_threshold * 2, + np.finfo(np.float32).max, + self.theta_threshold_radians * 2, + np.finfo(np.float32).max]) + + self.action_space = spaces.Discrete(2) + self.observation_space = spaces.Box(-high, high, dtype=np.float32) + + self.seed() + self.viewer = None + self.state = None + + self.steps_beyond_done = None + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def step(self, action): + assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action)) + state = self.state + x, x_dot, theta, theta_dot = state + force = self.force_mag if action==1 else -self.force_mag + costheta = math.cos(theta) + sintheta = math.sin(theta) + temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta) / self.total_mass + thetaacc = (self.gravity * sintheta - costheta* temp) / (self.length * (4.0/3.0 - self.masspole * costheta * costheta / self.total_mass)) + xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass + if self.kinematics_integrator == 'euler': + x = x + self.tau * x_dot + x_dot = x_dot + self.tau * xacc + theta = theta + self.tau * theta_dot + theta_dot = theta_dot + self.tau * thetaacc + else: # semi-implicit euler + x_dot = x_dot + self.tau * xacc + x = x + self.tau * x_dot + theta_dot = theta_dot + self.tau * thetaacc + theta = theta + self.tau * theta_dot + self.state = (x,x_dot,theta,theta_dot) + done = x < -self.x_threshold \ + or x > self.x_threshold \ + or theta < -self.theta_threshold_radians \ + or theta > self.theta_threshold_radians + done = bool(done) + + if not done: + reward = 1.0 + elif self.steps_beyond_done is None: + # Pole just fell! + self.steps_beyond_done = 0 + reward = 1.0 + else: + if self.steps_beyond_done == 0: + logger.warn("You are calling 'step()' even though this environment has already returned done = True. 
You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.") + self.steps_beyond_done += 1 + reward = 0.0 + + return np.array(self.state), reward, done, {} + + def reset(self): + self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,)) + self.steps_beyond_done = None + return np.array(self.state) + + def render(self, mode='human'): + screen_width = 600 + screen_height = 400 + + world_width = self.x_threshold*2 + scale = screen_width/world_width + carty = 100 # TOP OF CART + polewidth = 10.0 + polelen = scale * (2 * self.length) + cartwidth = 50.0 + cartheight = 30.0 + + if self.viewer is None: + from gym.envs.classic_control import rendering + self.viewer = rendering.Viewer(screen_width, screen_height) + l,r,t,b = -cartwidth/2, cartwidth/2, cartheight/2, -cartheight/2 + axleoffset =cartheight/4.0 + cart = rendering.FilledPolygon([(l,b), (l,t), (r,t), (r,b)]) + self.carttrans = rendering.Transform() + cart.add_attr(self.carttrans) + self.viewer.add_geom(cart) + l,r,t,b = -polewidth/2,polewidth/2,polelen-polewidth/2,-polewidth/2 + pole = rendering.FilledPolygon([(l,b), (l,t), (r,t), (r,b)]) + pole.set_color(.8,.6,.4) + self.poletrans = rendering.Transform(translation=(0, axleoffset)) + pole.add_attr(self.poletrans) + pole.add_attr(self.carttrans) + self.viewer.add_geom(pole) + self.axle = rendering.make_circle(polewidth/2) + self.axle.add_attr(self.poletrans) + self.axle.add_attr(self.carttrans) + self.axle.set_color(.5,.5,.8) + self.viewer.add_geom(self.axle) + self.track = rendering.Line((0,carty), (screen_width,carty)) + self.track.set_color(0,0,0) + self.viewer.add_geom(self.track) + + self._pole_geom = pole + + if self.state is None: return None + + # Edit the pole polygon vertex + pole = self._pole_geom + l,r,t,b = -polewidth/2,polewidth/2,polelen-polewidth/2,-polewidth/2 + pole.v = [(l,b), (l,t), (r,t), (r,b)] + + x = self.state + cartx = x[0]*scale+screen_width/2.0 # MIDDLE OF CART + self.carttrans.set_translation(cartx, carty) + self.poletrans.set_rotation(-x[2]) + + return self.viewer.render(return_rgb_array = mode=='rgb_array') + + def close(self): + if self.viewer: + self.viewer.close() + self.viewer = None diff --git a/src/gym/envs/classic_control/continuous_mountain_car.py b/src/gym/envs/classic_control/continuous_mountain_car.py new file mode 100644 index 0000000..65bec62 --- /dev/null +++ b/src/gym/envs/classic_control/continuous_mountain_car.py @@ -0,0 +1,144 @@ +# -*- coding: utf-8 -*- +""" +@author: Olivier Sigaud + +A merge between two sources: + +* Adaptation of the MountainCar Environment from the "FAReinforcement" library +of Jose Antonio Martin H. 
(version 1.0), adapted by 'Tom Schaul, tom@idsia.ch' +and then modified by Arnaud de Broissia + +* the OpenAI/gym MountainCar environment +itself from +http://incompleteideas.net/sutton/MountainCar/MountainCar1.cp +permalink: https://perma.cc/6Z2N-PFWC +""" + +import math +import gym +from gym import spaces +from gym.utils import seeding +import numpy as np + +class Continuous_MountainCarEnv(gym.Env): + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second': 30 + } + + def __init__(self): + self.min_action = -1.0 + self.max_action = 1.0 + self.min_position = -1.2 + self.max_position = 0.6 + self.max_speed = 0.07 + self.goal_position = 0.45 # was 0.5 in gym, 0.45 in Arnaud de Broissia's version + self.power = 0.0015 + + self.low_state = np.array([self.min_position, -self.max_speed]) + self.high_state = np.array([self.max_position, self.max_speed]) + + self.viewer = None + + self.action_space = spaces.Box(low=self.min_action, high=self.max_action, shape=(1,)) + self.observation_space = spaces.Box(low=self.low_state, high=self.high_state) + + self.seed() + self.reset() + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def step(self, action): + + position = self.state[0] + velocity = self.state[1] + force = min(max(action[0], -1.0), 1.0) + + velocity += force*self.power -0.0025 * math.cos(3*position) + if (velocity > self.max_speed): velocity = self.max_speed + if (velocity < -self.max_speed): velocity = -self.max_speed + position += velocity + if (position > self.max_position): position = self.max_position + if (position < self.min_position): position = self.min_position + if (position==self.min_position and velocity<0): velocity = 0 + + done = bool(position >= self.goal_position) + + reward = 0 + if done: + reward = 100.0 + reward-= math.pow(action[0],2)*0.1 + + self.state = np.array([position, velocity]) + return self.state, reward, done, {} + + def reset(self): + self.state = np.array([self.np_random.uniform(low=-0.6, high=-0.4), 0]) + return np.array(self.state) + +# def get_state(self): +# return self.state + + def _height(self, xs): + return np.sin(3 * xs)*.45+.55 + + def render(self, mode='human'): + screen_width = 600 + screen_height = 400 + + world_width = self.max_position - self.min_position + scale = screen_width/world_width + carwidth=40 + carheight=20 + + + if self.viewer is None: + from gym.envs.classic_control import rendering + self.viewer = rendering.Viewer(screen_width, screen_height) + xs = np.linspace(self.min_position, self.max_position, 100) + ys = self._height(xs) + xys = list(zip((xs-self.min_position)*scale, ys*scale)) + + self.track = rendering.make_polyline(xys) + self.track.set_linewidth(4) + self.viewer.add_geom(self.track) + + clearance = 10 + + l,r,t,b = -carwidth/2, carwidth/2, carheight, 0 + car = rendering.FilledPolygon([(l,b), (l,t), (r,t), (r,b)]) + car.add_attr(rendering.Transform(translation=(0, clearance))) + self.cartrans = rendering.Transform() + car.add_attr(self.cartrans) + self.viewer.add_geom(car) + frontwheel = rendering.make_circle(carheight/2.5) + frontwheel.set_color(.5, .5, .5) + frontwheel.add_attr(rendering.Transform(translation=(carwidth/4,clearance))) + frontwheel.add_attr(self.cartrans) + self.viewer.add_geom(frontwheel) + backwheel = rendering.make_circle(carheight/2.5) + backwheel.add_attr(rendering.Transform(translation=(-carwidth/4,clearance))) + backwheel.add_attr(self.cartrans) + backwheel.set_color(.5, .5, .5) + self.viewer.add_geom(backwheel) + flagx = 
(self.goal_position-self.min_position)*scale + flagy1 = self._height(self.goal_position)*scale + flagy2 = flagy1 + 50 + flagpole = rendering.Line((flagx, flagy1), (flagx, flagy2)) + self.viewer.add_geom(flagpole) + flag = rendering.FilledPolygon([(flagx, flagy2), (flagx, flagy2-10), (flagx+25, flagy2-5)]) + flag.set_color(.8,.8,0) + self.viewer.add_geom(flag) + + pos = self.state[0] + self.cartrans.set_translation((pos-self.min_position)*scale, self._height(pos)*scale) + self.cartrans.set_rotation(math.cos(3 * pos)) + + return self.viewer.render(return_rgb_array = mode=='rgb_array') + + def close(self): + if self.viewer: + self.viewer.close() + self.viewer = None diff --git a/src/gym/envs/classic_control/mountain_car.py b/src/gym/envs/classic_control/mountain_car.py new file mode 100644 index 0000000..40b5a63 --- /dev/null +++ b/src/gym/envs/classic_control/mountain_car.py @@ -0,0 +1,119 @@ +""" +http://incompleteideas.net/sutton/MountainCar/MountainCar1.cp +permalink: https://perma.cc/6Z2N-PFWC +""" + +import math +import gym +from gym import spaces +from gym.utils import seeding +import numpy as np + +class MountainCarEnv(gym.Env): + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second': 30 + } + + def __init__(self): + self.min_position = -1.2 + self.max_position = 0.6 + self.max_speed = 0.07 + self.goal_position = 0.5 + + self.low = np.array([self.min_position, -self.max_speed]) + self.high = np.array([self.max_position, self.max_speed]) + + self.viewer = None + + self.action_space = spaces.Discrete(3) + self.observation_space = spaces.Box(self.low, self.high) + + self.seed() + self.reset() + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def step(self, action): + assert self.action_space.contains(action), "%r (%s) invalid" % (action, type(action)) + + position, velocity = self.state + velocity += (action-1)*0.001 + math.cos(3*position)*(-0.0025) + velocity = np.clip(velocity, -self.max_speed, self.max_speed) + position += velocity + position = np.clip(position, self.min_position, self.max_position) + if (position==self.min_position and velocity<0): velocity = 0 + + done = bool(position >= self.goal_position) + reward = -1.0 + + self.state = (position, velocity) + return np.array(self.state), reward, done, {} + + def reset(self): + self.state = np.array([self.np_random.uniform(low=-0.6, high=-0.4), 0]) + return np.array(self.state) + + def _height(self, xs): + return np.sin(3 * xs)*.45+.55 + + def render(self, mode='human'): + screen_width = 600 + screen_height = 400 + + world_width = self.max_position - self.min_position + scale = screen_width/world_width + carwidth=40 + carheight=20 + + + if self.viewer is None: + from gym.envs.classic_control import rendering + self.viewer = rendering.Viewer(screen_width, screen_height) + xs = np.linspace(self.min_position, self.max_position, 100) + ys = self._height(xs) + xys = list(zip((xs-self.min_position)*scale, ys*scale)) + + self.track = rendering.make_polyline(xys) + self.track.set_linewidth(4) + self.viewer.add_geom(self.track) + + clearance = 10 + + l,r,t,b = -carwidth/2, carwidth/2, carheight, 0 + car = rendering.FilledPolygon([(l,b), (l,t), (r,t), (r,b)]) + car.add_attr(rendering.Transform(translation=(0, clearance))) + self.cartrans = rendering.Transform() + car.add_attr(self.cartrans) + self.viewer.add_geom(car) + frontwheel = rendering.make_circle(carheight/2.5) + frontwheel.set_color(.5, .5, .5) + 
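+ # The car body and both wheels share self.cartrans, so the single set_translation /
+ # set_rotation call further down moves and tilts the whole car along the track curve.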
frontwheel.add_attr(rendering.Transform(translation=(carwidth/4,clearance))) + frontwheel.add_attr(self.cartrans) + self.viewer.add_geom(frontwheel) + backwheel = rendering.make_circle(carheight/2.5) + backwheel.add_attr(rendering.Transform(translation=(-carwidth/4,clearance))) + backwheel.add_attr(self.cartrans) + backwheel.set_color(.5, .5, .5) + self.viewer.add_geom(backwheel) + flagx = (self.goal_position-self.min_position)*scale + flagy1 = self._height(self.goal_position)*scale + flagy2 = flagy1 + 50 + flagpole = rendering.Line((flagx, flagy1), (flagx, flagy2)) + self.viewer.add_geom(flagpole) + flag = rendering.FilledPolygon([(flagx, flagy2), (flagx, flagy2-10), (flagx+25, flagy2-5)]) + flag.set_color(.8,.8,0) + self.viewer.add_geom(flag) + + pos = self.state[0] + self.cartrans.set_translation((pos-self.min_position)*scale, self._height(pos)*scale) + self.cartrans.set_rotation(math.cos(3 * pos)) + + return self.viewer.render(return_rgb_array = mode=='rgb_array') + + def close(self): + if self.viewer: + self.viewer.close() + self.viewer = None diff --git a/src/gym/envs/classic_control/pendulum.py b/src/gym/envs/classic_control/pendulum.py new file mode 100644 index 0000000..940c2e1 --- /dev/null +++ b/src/gym/envs/classic_control/pendulum.py @@ -0,0 +1,90 @@ +import gym +from gym import spaces +from gym.utils import seeding +import numpy as np +from os import path + +class PendulumEnv(gym.Env): + metadata = { + 'render.modes' : ['human', 'rgb_array'], + 'video.frames_per_second' : 30 + } + + def __init__(self): + self.max_speed=8 + self.max_torque=2. + self.dt=.05 + self.viewer = None + + high = np.array([1., 1., self.max_speed]) + self.action_space = spaces.Box(low=-self.max_torque, high=self.max_torque, shape=(1,), dtype=np.float32) + self.observation_space = spaces.Box(low=-high, high=high, dtype=np.float32) + + self.seed() + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def step(self,u): + th, thdot = self.state # th := theta + + g = 10. + m = 1. + l = 1. + dt = self.dt + + u = np.clip(u, -self.max_torque, self.max_torque)[0] + self.last_u = u # for rendering + costs = angle_normalize(th)**2 + .1*thdot**2 + .001*(u**2) + + newthdot = thdot + (-3*g/(2*l) * np.sin(th + np.pi) + 3./(m*l**2)*u) * dt + newth = th + newthdot*dt + newthdot = np.clip(newthdot, -self.max_speed, self.max_speed) #pylint: disable=E1111 + + self.state = np.array([newth, newthdot]) + return self._get_obs(), -costs, False, {} + + def reset(self): + high = np.array([np.pi, 1]) + self.state = self.np_random.uniform(low=-high, high=high) + self.last_u = None + return self._get_obs() + + def _get_obs(self): + theta, thetadot = self.state + return np.array([np.cos(theta), np.sin(theta), thetadot]) + + def render(self, mode='human'): + + if self.viewer is None: + from gym.envs.classic_control import rendering + self.viewer = rendering.Viewer(500,500) + self.viewer.set_bounds(-2.2,2.2,-2.2,2.2) + rod = rendering.make_capsule(1, .2) + rod.set_color(.8, .3, .3) + self.pole_transform = rendering.Transform() + rod.add_attr(self.pole_transform) + self.viewer.add_geom(rod) + axle = rendering.make_circle(.05) + axle.set_color(0,0,0) + self.viewer.add_geom(axle) + fname = path.join(path.dirname(__file__), "assets/clockwise.png") + self.img = rendering.Image(fname, 1., 1.) 
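+ # clockwise.png is the torque-direction arrow: it is re-added as a one-time geometry
+ # every frame, and imgtrans is scaled by last_u below so the arrow's size and sign
+ # track the most recent torque command.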
+ self.imgtrans = rendering.Transform() + self.img.add_attr(self.imgtrans) + + self.viewer.add_onetime(self.img) + self.pole_transform.set_rotation(self.state[0] + np.pi/2) + if self.last_u: + self.imgtrans.scale = (-self.last_u/2, np.abs(self.last_u)/2) + + return self.viewer.render(return_rgb_array = mode=='rgb_array') + + def close(self): + if self.viewer: + self.viewer.close() + self.viewer = None + +def angle_normalize(x): + return (((x+np.pi) % (2*np.pi)) - np.pi) diff --git a/src/gym/envs/classic_control/rendering.py b/src/gym/envs/classic_control/rendering.py new file mode 100644 index 0000000..4fa23b2 --- /dev/null +++ b/src/gym/envs/classic_control/rendering.py @@ -0,0 +1,359 @@ +""" +2D rendering framework +""" +from __future__ import division +import os +import six +import sys + +if "Apple" in sys.version: + if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ: + os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib' + # (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite + +from gym.utils import reraise +from gym import error + +try: + import pyglet +except ImportError as e: + reraise(suffix="HINT: you can install pyglet directly via 'pip install pyglet'. But if you really just want to install all Gym dependencies and not have to think about it, 'pip install -e .[all]' or 'pip install gym[all]' will do it.") + +try: + from pyglet.gl import * +except ImportError as e: + reraise(prefix="Error occured while running `from pyglet.gl import *`",suffix="HINT: make sure you have OpenGL install. On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s \"-screen 0 1400x900x24\" python '") + +import math +import numpy as np + +RAD2DEG = 57.29577951308232 + +def get_display(spec): + """Convert a display specification (such as :0) into an actual Display + object. + + Pyglet only supports multiple Displays on Linux. + """ + if spec is None: + return None + elif isinstance(spec, six.string_types): + return pyglet.canvas.Display(spec) + else: + raise error.Error('Invalid display specification: {}. 
(Must be a string like :0 or None.)'.format(spec)) + +class Viewer(object): + def __init__(self, width, height, display=None): + display = get_display(display) + + self.width = width + self.height = height + self.window = pyglet.window.Window(width=width, height=height, display=display) + self.window.on_close = self.window_closed_by_user + self.isopen = True + self.geoms = [] + self.onetime_geoms = [] + self.transform = Transform() + + glEnable(GL_BLEND) + glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) + + def close(self): + self.window.close() + + def window_closed_by_user(self): + self.isopen = False + + def set_bounds(self, left, right, bottom, top): + assert right > left and top > bottom + scalex = self.width/(right-left) + scaley = self.height/(top-bottom) + self.transform = Transform( + translation=(-left*scalex, -bottom*scaley), + scale=(scalex, scaley)) + + def add_geom(self, geom): + self.geoms.append(geom) + + def add_onetime(self, geom): + self.onetime_geoms.append(geom) + + def render(self, return_rgb_array=False): + glClearColor(1,1,1,1) + self.window.clear() + self.window.switch_to() + self.window.dispatch_events() + self.transform.enable() + for geom in self.geoms: + geom.render() + for geom in self.onetime_geoms: + geom.render() + self.transform.disable() + arr = None + if return_rgb_array: + buffer = pyglet.image.get_buffer_manager().get_color_buffer() + image_data = buffer.get_image_data() + arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') + # In https://github.com/openai/gym-http-api/issues/2, we + # discovered that someone using Xmonad on Arch was having + # a window of size 598 x 398, though a 600 x 400 window + # was requested. (Guess Xmonad was preserving a pixel for + # the boundary.) So we use the buffer height/width rather + # than the requested one. 
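+ # The raw color buffer is a flat, bottom-up RGBA byte array, so it is reshaped to
+ # (buffer.height, buffer.width, 4), flipped vertically, and the alpha channel dropped
+ # to produce the (H, W, 3) RGB array that rgb_array callers expect.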
+ arr = arr.reshape(buffer.height, buffer.width, 4) + arr = arr[::-1,:,0:3] + self.window.flip() + self.onetime_geoms = [] + return arr if return_rgb_array else self.isopen + + # Convenience + def draw_circle(self, radius=10, res=30, filled=True, **attrs): + geom = make_circle(radius=radius, res=res, filled=filled) + _add_attrs(geom, attrs) + self.add_onetime(geom) + return geom + + def draw_polygon(self, v, filled=True, **attrs): + geom = make_polygon(v=v, filled=filled) + _add_attrs(geom, attrs) + self.add_onetime(geom) + return geom + + def draw_polyline(self, v, **attrs): + geom = make_polyline(v=v) + _add_attrs(geom, attrs) + self.add_onetime(geom) + return geom + + def draw_line(self, start, end, **attrs): + geom = Line(start, end) + _add_attrs(geom, attrs) + self.add_onetime(geom) + return geom + + def get_array(self): + self.window.flip() + image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data() + self.window.flip() + arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') + arr = arr.reshape(self.height, self.width, 4) + return arr[::-1,:,0:3] + + def __del__(self): + self.close() + +def _add_attrs(geom, attrs): + if "color" in attrs: + geom.set_color(*attrs["color"]) + if "linewidth" in attrs: + geom.set_linewidth(attrs["linewidth"]) + +class Geom(object): + def __init__(self): + self._color=Color((0, 0, 0, 1.0)) + self.attrs = [self._color] + def render(self): + for attr in reversed(self.attrs): + attr.enable() + self.render1() + for attr in self.attrs: + attr.disable() + def render1(self): + raise NotImplementedError + def add_attr(self, attr): + self.attrs.append(attr) + def set_color(self, r, g, b): + self._color.vec4 = (r, g, b, 1) + +class Attr(object): + def enable(self): + raise NotImplementedError + def disable(self): + pass + +class Transform(Attr): + def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1,1)): + self.set_translation(*translation) + self.set_rotation(rotation) + self.set_scale(*scale) + def enable(self): + glPushMatrix() + glTranslatef(self.translation[0], self.translation[1], 0) # translate to GL loc ppint + glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0) + glScalef(self.scale[0], self.scale[1], 1) + def disable(self): + glPopMatrix() + def set_translation(self, newx, newy): + self.translation = (float(newx), float(newy)) + def set_rotation(self, new): + self.rotation = float(new) + def set_scale(self, newx, newy): + self.scale = (float(newx), float(newy)) + +class Color(Attr): + def __init__(self, vec4): + self.vec4 = vec4 + def enable(self): + glColor4f(*self.vec4) + +class LineStyle(Attr): + def __init__(self, style): + self.style = style + def enable(self): + glEnable(GL_LINE_STIPPLE) + glLineStipple(1, self.style) + def disable(self): + glDisable(GL_LINE_STIPPLE) + +class LineWidth(Attr): + def __init__(self, stroke): + self.stroke = stroke + def enable(self): + glLineWidth(self.stroke) + +class Point(Geom): + def __init__(self): + Geom.__init__(self) + def render1(self): + glBegin(GL_POINTS) # draw point + glVertex3f(0.0, 0.0, 0.0) + glEnd() + +class FilledPolygon(Geom): + def __init__(self, v): + Geom.__init__(self) + self.v = v + def render1(self): + if len(self.v) == 4 : glBegin(GL_QUADS) + elif len(self.v) > 4 : glBegin(GL_POLYGON) + else: glBegin(GL_TRIANGLES) + for p in self.v: + glVertex3f(p[0], p[1],0) # draw each vertex + glEnd() + +def make_circle(radius=10, res=30, filled=True): + points = [] + for i in range(res): + ang = 2*math.pi*i / res + points.append((math.cos(ang)*radius, 
math.sin(ang)*radius)) + if filled: + return FilledPolygon(points) + else: + return PolyLine(points, True) + +def make_polygon(v, filled=True): + if filled: return FilledPolygon(v) + else: return PolyLine(v, True) + +def make_polyline(v): + return PolyLine(v, False) + +def make_capsule(length, width): + l, r, t, b = 0, length, width/2, -width/2 + box = make_polygon([(l,b), (l,t), (r,t), (r,b)]) + circ0 = make_circle(width/2) + circ1 = make_circle(width/2) + circ1.add_attr(Transform(translation=(length, 0))) + geom = Compound([box, circ0, circ1]) + return geom + +class Compound(Geom): + def __init__(self, gs): + Geom.__init__(self) + self.gs = gs + for g in self.gs: + g.attrs = [a for a in g.attrs if not isinstance(a, Color)] + def render1(self): + for g in self.gs: + g.render() + +class PolyLine(Geom): + def __init__(self, v, close): + Geom.__init__(self) + self.v = v + self.close = close + self.linewidth = LineWidth(1) + self.add_attr(self.linewidth) + def render1(self): + glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP) + for p in self.v: + glVertex3f(p[0], p[1],0) # draw each vertex + glEnd() + def set_linewidth(self, x): + self.linewidth.stroke = x + +class Line(Geom): + def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)): + Geom.__init__(self) + self.start = start + self.end = end + self.linewidth = LineWidth(1) + self.add_attr(self.linewidth) + + def render1(self): + glBegin(GL_LINES) + glVertex2f(*self.start) + glVertex2f(*self.end) + glEnd() + +class Image(Geom): + def __init__(self, fname, width, height): + Geom.__init__(self) + self.width = width + self.height = height + img = pyglet.image.load(fname) + self.img = img + self.flip = False + def render1(self): + self.img.blit(-self.width/2, -self.height/2, width=self.width, height=self.height) + +# ================================================================ + +class SimpleImageViewer(object): + def __init__(self, display=None, maxwidth=500): + self.window = None + self.isopen = False + self.display = display + self.maxwidth = maxwidth + def imshow(self, arr): + if self.window is None: + height, width, _channels = arr.shape + if width > self.maxwidth: + scale = self.maxwidth / width + width = int(scale * width) + height = int(scale * height) + self.window = pyglet.window.Window(width=width, height=height, + display=self.display, vsync=False, resizable=True) + self.width = width + self.height = height + self.isopen = True + + @self.window.event + def on_resize(width, height): + self.width = width + self.height = height + + @self.window.event + def on_close(): + self.isopen = False + + assert len(arr.shape) == 3, "You passed in an image with the wrong number shape" + image = pyglet.image.ImageData(arr.shape[1], arr.shape[0], + 'RGB', arr.tobytes(), pitch=arr.shape[1]*-3) + gl.glTexParameteri(gl.GL_TEXTURE_2D, + gl.GL_TEXTURE_MAG_FILTER, gl.GL_NEAREST) + texture = image.get_texture() + texture.width = self.width + texture.height = self.height + self.window.clear() + self.window.switch_to() + self.window.dispatch_events() + texture.blit(0, 0) # draw + self.window.flip() + def close(self): + if self.isopen: + self.window.close() + self.isopen = False + + def __del__(self): + self.close() diff --git a/src/gym/envs/mujoco/__init__.py b/src/gym/envs/mujoco/__init__.py new file mode 100644 index 0000000..ec1e3b0 --- /dev/null +++ b/src/gym/envs/mujoco/__init__.py @@ -0,0 +1,16 @@ +from gym.envs.mujoco.mujoco_env import MujocoEnv +# ^^^^^ so that user gets the correct error +# message if mujoco is not installed correctly +from 
gym.envs.mujoco.ant import AntEnv +from gym.envs.mujoco.half_cheetah import HalfCheetahEnv +from gym.envs.mujoco.hopper import HopperEnv +from gym.envs.mujoco.walker2d import Walker2dEnv +from gym.envs.mujoco.humanoid import HumanoidEnv +from gym.envs.mujoco.inverted_pendulum import InvertedPendulumEnv +from gym.envs.mujoco.inverted_double_pendulum import InvertedDoublePendulumEnv +from gym.envs.mujoco.reacher import ReacherEnv +from gym.envs.mujoco.swimmer import SwimmerEnv +from gym.envs.mujoco.humanoidstandup import HumanoidStandupEnv +from gym.envs.mujoco.pusher import PusherEnv +from gym.envs.mujoco.thrower import ThrowerEnv +from gym.envs.mujoco.striker import StrikerEnv diff --git a/src/gym/envs/mujoco/ant.py b/src/gym/envs/mujoco/ant.py new file mode 100644 index 0000000..550fb64 --- /dev/null +++ b/src/gym/envs/mujoco/ant.py @@ -0,0 +1,45 @@ +import numpy as np +from gym import utils +from gym.envs.mujoco import mujoco_env + +class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self): + mujoco_env.MujocoEnv.__init__(self, 'ant.xml', 5) + utils.EzPickle.__init__(self) + + def step(self, a): + xposbefore = self.get_body_com("torso")[0] + self.do_simulation(a, self.frame_skip) + xposafter = self.get_body_com("torso")[0] + forward_reward = (xposafter - xposbefore)/self.dt + ctrl_cost = .5 * np.square(a).sum() + contact_cost = 0.5 * 1e-3 * np.sum( + np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))) + survive_reward = 1.0 + reward = forward_reward - ctrl_cost - contact_cost + survive_reward + state = self.state_vector() + notdone = np.isfinite(state).all() \ + and state[2] >= 0.2 and state[2] <= 1.0 + done = not notdone + ob = self._get_obs() + return ob, reward, done, dict( + reward_forward=forward_reward, + reward_ctrl=-ctrl_cost, + reward_contact=-contact_cost, + reward_survive=survive_reward) + + def _get_obs(self): + return np.concatenate([ + self.sim.data.qpos.flat[2:], + self.sim.data.qvel.flat, + np.clip(self.sim.data.cfrc_ext, -1, 1).flat, + ]) + + def reset_model(self): + qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) + qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 + self.set_state(qpos, qvel) + return self._get_obs() + + def viewer_setup(self): + self.viewer.cam.distance = self.model.stat.extent * 0.5 diff --git a/src/gym/envs/mujoco/assets/ant.xml b/src/gym/envs/mujoco/assets/ant.xml new file mode 100644 index 0000000..ee4d679 --- /dev/null +++ b/src/gym/envs/mujoco/assets/ant.xml @@ -0,0 +1,81 @@ + + + diff --git a/src/gym/envs/mujoco/assets/half_cheetah.xml b/src/gym/envs/mujoco/assets/half_cheetah.xml new file mode 100644 index 0000000..338c2e8 --- /dev/null +++ b/src/gym/envs/mujoco/assets/half_cheetah.xml @@ -0,0 +1,96 @@ + + + + + + + + + + diff --git a/src/gym/envs/mujoco/assets/hopper.xml b/src/gym/envs/mujoco/assets/hopper.xml new file mode 100644 index 0000000..f18bc46 --- /dev/null +++ b/src/gym/envs/mujoco/assets/hopper.xml @@ -0,0 +1,48 @@ + + + + + + + + diff --git a/src/gym/envs/mujoco/assets/humanoid.xml b/src/gym/envs/mujoco/assets/humanoid.xml new file mode 100755 index 0000000..db19261 --- /dev/null +++ b/src/gym/envs/mujoco/assets/humanoid.xml @@ -0,0 +1,121 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/gym/envs/mujoco/assets/humanoidstandup.xml 
b/src/gym/envs/mujoco/assets/humanoidstandup.xml new file mode 100755 index 0000000..854dbc9 --- /dev/null +++ b/src/gym/envs/mujoco/assets/humanoidstandup.xml @@ -0,0 +1,121 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/gym/envs/mujoco/assets/inverted_double_pendulum.xml b/src/gym/envs/mujoco/assets/inverted_double_pendulum.xml new file mode 100644 index 0000000..a274e8c --- /dev/null +++ b/src/gym/envs/mujoco/assets/inverted_double_pendulum.xml @@ -0,0 +1,47 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/src/gym/envs/mujoco/assets/inverted_pendulum.xml b/src/gym/envs/mujoco/assets/inverted_pendulum.xml new file mode 100644 index 0000000..b778ba0 --- /dev/null +++ b/src/gym/envs/mujoco/assets/inverted_pendulum.xml @@ -0,0 +1,27 @@ + + + + + + + + + \ No newline at end of file diff --git a/src/gym/envs/mujoco/assets/point.xml b/src/gym/envs/mujoco/assets/point.xml new file mode 100644 index 0000000..e35ef3d --- /dev/null +++ b/src/gym/envs/mujoco/assets/point.xml @@ -0,0 +1,31 @@ + + + diff --git a/src/gym/envs/mujoco/assets/pusher.xml b/src/gym/envs/mujoco/assets/pusher.xml new file mode 100644 index 0000000..31a5ef7 --- /dev/null +++ b/src/gym/envs/mujoco/assets/pusher.xml @@ -0,0 +1,91 @@ + + + diff --git a/src/gym/envs/mujoco/assets/reacher.xml b/src/gym/envs/mujoco/assets/reacher.xml new file mode 100644 index 0000000..28ebd67 --- /dev/null +++ b/src/gym/envs/mujoco/assets/reacher.xml @@ -0,0 +1,39 @@ + + + + + + + \ No newline at end of file diff --git a/src/gym/envs/mujoco/assets/striker.xml b/src/gym/envs/mujoco/assets/striker.xml new file mode 100644 index 0000000..f66f808 --- /dev/null +++ b/src/gym/envs/mujoco/assets/striker.xml @@ -0,0 +1,101 @@ + + + diff --git a/src/gym/envs/mujoco/assets/swimmer.xml b/src/gym/envs/mujoco/assets/swimmer.xml new file mode 100644 index 0000000..cda25da --- /dev/null +++ b/src/gym/envs/mujoco/assets/swimmer.xml @@ -0,0 +1,38 @@ + + + diff --git a/src/gym/envs/mujoco/assets/thrower.xml b/src/gym/envs/mujoco/assets/thrower.xml new file mode 100644 index 0000000..b68f256 --- /dev/null +++ b/src/gym/envs/mujoco/assets/thrower.xml @@ -0,0 +1,127 @@ + + + diff --git a/src/gym/envs/mujoco/assets/walker2d.xml b/src/gym/envs/mujoco/assets/walker2d.xml new file mode 100644 index 0000000..3342571 --- /dev/null +++ b/src/gym/envs/mujoco/assets/walker2d.xml @@ -0,0 +1,62 @@ + + + + + + + diff --git a/src/gym/envs/mujoco/half_cheetah.py b/src/gym/envs/mujoco/half_cheetah.py new file mode 100644 index 0000000..ea9761c --- /dev/null +++ b/src/gym/envs/mujoco/half_cheetah.py @@ -0,0 +1,34 @@ +import numpy as np +from gym import utils +from gym.envs.mujoco import mujoco_env + +class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self): + mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 5) + utils.EzPickle.__init__(self) + + def step(self, action): + xposbefore = self.sim.data.qpos[0] + self.do_simulation(action, self.frame_skip) + xposafter = self.sim.data.qpos[0] + ob = self._get_obs() + reward_ctrl = - 0.1 * np.square(action).sum() + reward_run = (xposafter - xposbefore)/self.dt + reward = reward_ctrl + reward_run + done = False + return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) + + def _get_obs(self): + return np.concatenate([ + self.sim.data.qpos.flat[1:], + 
self.sim.data.qvel.flat, + ]) + + def reset_model(self): + qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) + qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 + self.set_state(qpos, qvel) + return self._get_obs() + + def viewer_setup(self): + self.viewer.cam.distance = self.model.stat.extent * 0.5 diff --git a/src/gym/envs/mujoco/hopper.py b/src/gym/envs/mujoco/hopper.py new file mode 100644 index 0000000..be826a4 --- /dev/null +++ b/src/gym/envs/mujoco/hopper.py @@ -0,0 +1,40 @@ +import numpy as np +from gym import utils +from gym.envs.mujoco import mujoco_env + +class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self): + mujoco_env.MujocoEnv.__init__(self, 'hopper.xml', 4) + utils.EzPickle.__init__(self) + + def step(self, a): + posbefore = self.sim.data.qpos[0] + self.do_simulation(a, self.frame_skip) + posafter, height, ang = self.sim.data.qpos[0:3] + alive_bonus = 1.0 + reward = (posafter - posbefore) / self.dt + reward += alive_bonus + reward -= 1e-3 * np.square(a).sum() + s = self.state_vector() + done = not (np.isfinite(s).all() and (np.abs(s[2:]) < 100).all() and + (height > .7) and (abs(ang) < .2)) + ob = self._get_obs() + return ob, reward, done, {} + + def _get_obs(self): + return np.concatenate([ + self.sim.data.qpos.flat[1:], + np.clip(self.sim.data.qvel.flat, -10, 10) + ]) + + def reset_model(self): + qpos = self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq) + qvel = self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) + self.set_state(qpos, qvel) + return self._get_obs() + + def viewer_setup(self): + self.viewer.cam.trackbodyid = 2 + self.viewer.cam.distance = self.model.stat.extent * 0.75 + self.viewer.cam.lookat[2] = 1.15 + self.viewer.cam.elevation = -20 diff --git a/src/gym/envs/mujoco/humanoid.py b/src/gym/envs/mujoco/humanoid.py new file mode 100644 index 0000000..021733e --- /dev/null +++ b/src/gym/envs/mujoco/humanoid.py @@ -0,0 +1,51 @@ +import numpy as np +from gym.envs.mujoco import mujoco_env +from gym import utils + +def mass_center(model, sim): + mass = np.expand_dims(model.body_mass, 1) + xpos = sim.data.xipos + return (np.sum(mass * xpos, 0) / np.sum(mass))[0] + +class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self): + mujoco_env.MujocoEnv.__init__(self, 'humanoid.xml', 5) + utils.EzPickle.__init__(self) + + def _get_obs(self): + data = self.sim.data + return np.concatenate([data.qpos.flat[2:], + data.qvel.flat, + data.cinert.flat, + data.cvel.flat, + data.qfrc_actuator.flat, + data.cfrc_ext.flat]) + + def step(self, a): + pos_before = mass_center(self.model, self.sim) + self.do_simulation(a, self.frame_skip) + pos_after = mass_center(self.model, self.sim) + alive_bonus = 5.0 + data = self.sim.data + lin_vel_cost = 0.25 * (pos_after - pos_before) / self.model.opt.timestep + quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum() + quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum() + quad_impact_cost = min(quad_impact_cost, 10) + reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus + qpos = self.sim.data.qpos + done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0)) + return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost, reward_alive=alive_bonus, reward_impact=-quad_impact_cost) + + def reset_model(self): + c = 0.01 + self.set_state( + self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq), + self.init_qvel + 
self.np_random.uniform(low=-c, high=c, size=self.model.nv,) + ) + return self._get_obs() + + def viewer_setup(self): + self.viewer.cam.trackbodyid = 1 + self.viewer.cam.distance = self.model.stat.extent * 1.0 + self.viewer.cam.lookat[2] = 2.0 + self.viewer.cam.elevation = -20 diff --git a/src/gym/envs/mujoco/humanoidstandup.py b/src/gym/envs/mujoco/humanoidstandup.py new file mode 100644 index 0000000..6ab595b --- /dev/null +++ b/src/gym/envs/mujoco/humanoidstandup.py @@ -0,0 +1,45 @@ +from gym.envs.mujoco import mujoco_env +from gym import utils +import numpy as np + +class HumanoidStandupEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self): + mujoco_env.MujocoEnv.__init__(self, 'humanoidstandup.xml', 5) + utils.EzPickle.__init__(self) + + def _get_obs(self): + data = self.sim.data + return np.concatenate([data.qpos.flat[2:], + data.qvel.flat, + data.cinert.flat, + data.cvel.flat, + data.qfrc_actuator.flat, + data.cfrc_ext.flat]) + + def step(self, a): + self.do_simulation(a, self.frame_skip) + pos_after = self.sim.data.qpos[2] + data = self.sim.data + uph_cost = (pos_after - 0) / self.model.opt.timestep + + quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum() + quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum() + quad_impact_cost = min(quad_impact_cost, 10) + reward = uph_cost - quad_ctrl_cost - quad_impact_cost + 1 + + done = bool(False) + return self._get_obs(), reward, done, dict(reward_linup=uph_cost, reward_quadctrl=-quad_ctrl_cost, reward_impact=-quad_impact_cost) + + def reset_model(self): + c = 0.01 + self.set_state( + self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq), + self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,) + ) + return self._get_obs() + + def viewer_setup(self): + self.viewer.cam.trackbodyid = 1 + self.viewer.cam.distance = self.model.stat.extent * 1.0 + self.viewer.cam.lookat[2] = 0.8925 + self.viewer.cam.elevation = -20 diff --git a/src/gym/envs/mujoco/inverted_double_pendulum.py b/src/gym/envs/mujoco/inverted_double_pendulum.py new file mode 100644 index 0000000..f461066 --- /dev/null +++ b/src/gym/envs/mujoco/inverted_double_pendulum.py @@ -0,0 +1,43 @@ +import numpy as np +from gym import utils +from gym.envs.mujoco import mujoco_env + +class InvertedDoublePendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle): + + def __init__(self): + mujoco_env.MujocoEnv.__init__(self, 'inverted_double_pendulum.xml', 5) + utils.EzPickle.__init__(self) + + def step(self, action): + self.do_simulation(action, self.frame_skip) + ob = self._get_obs() + x, _, y = self.sim.data.site_xpos[0] + dist_penalty = 0.01 * x ** 2 + (y - 2) ** 2 + v1, v2 = self.sim.data.qvel[1:3] + vel_penalty = 1e-3 * v1**2 + 5e-3 * v2**2 + alive_bonus = 10 + r = alive_bonus - dist_penalty - vel_penalty + done = bool(y <= 1) + return ob, r, done, {} + + def _get_obs(self): + return np.concatenate([ + self.sim.data.qpos[:1], # cart x pos + np.sin(self.sim.data.qpos[1:]), # link angles + np.cos(self.sim.data.qpos[1:]), + np.clip(self.sim.data.qvel, -10, 10), + np.clip(self.sim.data.qfrc_constraint, -10, 10) + ]).ravel() + + def reset_model(self): + self.set_state( + self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq), + self.init_qvel + self.np_random.randn(self.model.nv) * .1 + ) + return self._get_obs() + + def viewer_setup(self): + v = self.viewer + v.cam.trackbodyid = 0 + v.cam.distance = self.model.stat.extent * 0.5 + v.cam.lookat[2] = 0.12250000000000005 # v.model.stat.center[2] diff --git 
a/src/gym/envs/mujoco/inverted_pendulum.py b/src/gym/envs/mujoco/inverted_pendulum.py new file mode 100644 index 0000000..b2ed6bd --- /dev/null +++ b/src/gym/envs/mujoco/inverted_pendulum.py @@ -0,0 +1,30 @@ +import numpy as np +from gym import utils +from gym.envs.mujoco import mujoco_env + +class InvertedPendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self): + utils.EzPickle.__init__(self) + mujoco_env.MujocoEnv.__init__(self, 'inverted_pendulum.xml', 2) + + def step(self, a): + reward = 1.0 + self.do_simulation(a, self.frame_skip) + ob = self._get_obs() + notdone = np.isfinite(ob).all() and (np.abs(ob[1]) <= .2) + done = not notdone + return ob, reward, done, {} + + def reset_model(self): + qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-0.01, high=0.01) + qvel = self.init_qvel + self.np_random.uniform(size=self.model.nv, low=-0.01, high=0.01) + self.set_state(qpos, qvel) + return self._get_obs() + + def _get_obs(self): + return np.concatenate([self.sim.data.qpos, self.sim.data.qvel]).ravel() + + def viewer_setup(self): + v = self.viewer + v.cam.trackbodyid = 0 + v.cam.distance = self.model.stat.extent diff --git a/src/gym/envs/mujoco/mujoco_env.py b/src/gym/envs/mujoco/mujoco_env.py new file mode 100644 index 0000000..11cf9ba --- /dev/null +++ b/src/gym/envs/mujoco/mujoco_env.py @@ -0,0 +1,145 @@ +import os + +from gym import error, spaces +from gym.utils import seeding +import numpy as np +from os import path +import gym +import six + +try: + import mujoco_py +except ImportError as e: + raise error.DependencyNotInstalled("{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e)) + +DEFAULT_SIZE = 500 + +class MujocoEnv(gym.Env): + """Superclass for all MuJoCo environments. + """ + + def __init__(self, model_path, frame_skip): + if model_path.startswith("/"): + fullpath = model_path + else: + fullpath = os.path.join(os.path.dirname(__file__), "assets", model_path) + if not path.exists(fullpath): + raise IOError("File %s does not exist" % fullpath) + self.frame_skip = frame_skip + self.model = mujoco_py.load_model_from_path(fullpath) + self.sim = mujoco_py.MjSim(self.model) + self.data = self.sim.data + self.viewer = None + self._viewers = {} + + self.metadata = { + 'render.modes': ['human', 'rgb_array', 'depth_array'], + 'video.frames_per_second': int(np.round(1.0 / self.dt)) + } + + self.init_qpos = self.sim.data.qpos.ravel().copy() + self.init_qvel = self.sim.data.qvel.ravel().copy() + observation, _reward, done, _info = self.step(np.zeros(self.model.nu)) + assert not done + self.obs_dim = observation.size + + bounds = self.model.actuator_ctrlrange.copy() + low = bounds[:, 0] + high = bounds[:, 1] + self.action_space = spaces.Box(low=low, high=high, dtype=np.float32) + + high = np.inf*np.ones(self.obs_dim) + low = -high + self.observation_space = spaces.Box(low, high, dtype=np.float32) + + self.seed() + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + # methods to override: + # ---------------------------- + + def reset_model(self): + """ + Reset the robot degrees of freedom (qpos and qvel). + Implement this in each subclass. + """ + raise NotImplementedError + + def viewer_setup(self): + """ + This method is called when the viewer is initialized. + Optionally implement this method, if you need to tinker with camera position + and so forth. 
+ """ + pass + + # ----------------------------- + + def reset(self): + self.sim.reset() + ob = self.reset_model() + return ob + + def set_state(self, qpos, qvel): + assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,) + old_state = self.sim.get_state() + new_state = mujoco_py.MjSimState(old_state.time, qpos, qvel, + old_state.act, old_state.udd_state) + self.sim.set_state(new_state) + self.sim.forward() + + @property + def dt(self): + return self.model.opt.timestep * self.frame_skip + + def do_simulation(self, ctrl, n_frames): + self.sim.data.ctrl[:] = ctrl + for _ in range(n_frames): + self.sim.step() + + def render(self, mode='human', width=DEFAULT_SIZE, height=DEFAULT_SIZE): + if mode == 'rgb_array': + self._get_viewer(mode).render(width, height) + # window size used for old mujoco-py: + data = self._get_viewer(mode).read_pixels(width, height, depth=False) + # original image is upside-down, so flip it + return data[::-1, :, :] + elif mode == 'depth_array': + self._get_viewer(mode).render(width, height) + # window size used for old mujoco-py: + # Extract depth part of the read_pixels() tuple + data = self._get_viewer(mode).read_pixels(width, height, depth=True)[1] + # original image is upside-down, so flip it + return data[::-1, :] + elif mode == 'human': + self._get_viewer(mode).render() + + def close(self): + if self.viewer is not None: + # self.viewer.finish() + self.viewer = None + self._viewers = {} + + def _get_viewer(self, mode): + self.viewer = self._viewers.get(mode) + if self.viewer is None: + if mode == 'human': + self.viewer = mujoco_py.MjViewer(self.sim) + elif mode == 'rgb_array' or mode == 'depth_array': + self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, -1) + + self.viewer_setup() + self._viewers[mode] = self.viewer + return self.viewer + + def get_body_com(self, body_name): + return self.data.get_body_xpos(body_name) + + def state_vector(self): + return np.concatenate([ + self.sim.data.qpos.flat, + self.sim.data.qvel.flat + ]) diff --git a/src/gym/envs/mujoco/pusher.py b/src/gym/envs/mujoco/pusher.py new file mode 100644 index 0000000..78670e6 --- /dev/null +++ b/src/gym/envs/mujoco/pusher.py @@ -0,0 +1,57 @@ +import numpy as np +from gym import utils +from gym.envs.mujoco import mujoco_env + +import mujoco_py + +class PusherEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self): + utils.EzPickle.__init__(self) + mujoco_env.MujocoEnv.__init__(self, 'pusher.xml', 5) + + def step(self, a): + vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm") + vec_2 = self.get_body_com("object") - self.get_body_com("goal") + + reward_near = - np.linalg.norm(vec_1) + reward_dist = - np.linalg.norm(vec_2) + reward_ctrl = - np.square(a).sum() + reward = reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near + + self.do_simulation(a, self.frame_skip) + ob = self._get_obs() + done = False + return ob, reward, done, dict(reward_dist=reward_dist, + reward_ctrl=reward_ctrl) + + def viewer_setup(self): + self.viewer.cam.trackbodyid = -1 + self.viewer.cam.distance = 4.0 + + def reset_model(self): + qpos = self.init_qpos + + self.goal_pos = np.asarray([0, 0]) + while True: + self.cylinder_pos = np.concatenate([ + self.np_random.uniform(low=-0.3, high=0, size=1), + self.np_random.uniform(low=-0.2, high=0.2, size=1)]) + if np.linalg.norm(self.cylinder_pos - self.goal_pos) > 0.17: + break + + qpos[-4:-2] = self.cylinder_pos + qpos[-2:] = self.goal_pos + qvel = self.init_qvel + self.np_random.uniform(low=-0.005, + high=0.005, size=self.model.nv) + 
qvel[-4:] = 0 + self.set_state(qpos, qvel) + return self._get_obs() + + def _get_obs(self): + return np.concatenate([ + self.sim.data.qpos.flat[:7], + self.sim.data.qvel.flat[:7], + self.get_body_com("tips_arm"), + self.get_body_com("object"), + self.get_body_com("goal"), + ]) diff --git a/src/gym/envs/mujoco/reacher.py b/src/gym/envs/mujoco/reacher.py new file mode 100644 index 0000000..53a16ab --- /dev/null +++ b/src/gym/envs/mujoco/reacher.py @@ -0,0 +1,43 @@ +import numpy as np +from gym import utils +from gym.envs.mujoco import mujoco_env + +class ReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self): + utils.EzPickle.__init__(self) + mujoco_env.MujocoEnv.__init__(self, 'reacher.xml', 2) + + def step(self, a): + vec = self.get_body_com("fingertip")-self.get_body_com("target") + reward_dist = - np.linalg.norm(vec) + reward_ctrl = - np.square(a).sum() + reward = reward_dist + reward_ctrl + self.do_simulation(a, self.frame_skip) + ob = self._get_obs() + done = False + return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl) + + def viewer_setup(self): + self.viewer.cam.trackbodyid = 0 + + def reset_model(self): + qpos = self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) + self.init_qpos + while True: + self.goal = self.np_random.uniform(low=-.2, high=.2, size=2) + if np.linalg.norm(self.goal) < 2: + break + qpos[-2:] = self.goal + qvel = self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) + qvel[-2:] = 0 + self.set_state(qpos, qvel) + return self._get_obs() + + def _get_obs(self): + theta = self.sim.data.qpos.flat[:2] + return np.concatenate([ + np.cos(theta), + np.sin(theta), + self.sim.data.qpos.flat[2:], + self.sim.data.qvel.flat[:2], + self.get_body_com("fingertip") - self.get_body_com("target") + ]) diff --git a/src/gym/envs/mujoco/striker.py b/src/gym/envs/mujoco/striker.py new file mode 100644 index 0000000..2efb27e --- /dev/null +++ b/src/gym/envs/mujoco/striker.py @@ -0,0 +1,75 @@ +import numpy as np +from gym import utils +from gym.envs.mujoco import mujoco_env + +class StrikerEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self): + utils.EzPickle.__init__(self) + self._striked = False + self._min_strike_dist = np.inf + self.strike_threshold = 0.1 + mujoco_env.MujocoEnv.__init__(self, 'striker.xml', 5) + + def step(self, a): + vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm") + vec_2 = self.get_body_com("object") - self.get_body_com("goal") + self._min_strike_dist = min(self._min_strike_dist, np.linalg.norm(vec_2)) + + if np.linalg.norm(vec_1) < self.strike_threshold: + self._striked = True + self._strike_pos = self.get_body_com("tips_arm") + + if self._striked: + vec_3 = self.get_body_com("object") - self._strike_pos + reward_near = - np.linalg.norm(vec_3) + else: + reward_near = - np.linalg.norm(vec_1) + + reward_dist = - np.linalg.norm(self._min_strike_dist) + reward_ctrl = - np.square(a).sum() + reward = 3 * reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near + + self.do_simulation(a, self.frame_skip) + ob = self._get_obs() + done = False + return ob, reward, done, dict(reward_dist=reward_dist, + reward_ctrl=reward_ctrl) + + def viewer_setup(self): + self.viewer.cam.trackbodyid = 0 + self.viewer.cam.distance = 4.0 + + def reset_model(self): + self._min_strike_dist = np.inf + self._striked = False + self._strike_pos = None + + qpos = self.init_qpos + + self.ball = np.array([0.5, -0.175]) + while True: + self.goal = np.concatenate([ + self.np_random.uniform(low=0.15, 
high=0.7, size=1), + self.np_random.uniform(low=0.1, high=1.0, size=1)]) + if np.linalg.norm(self.ball - self.goal) > 0.17: + break + + qpos[-9:-7] = [self.ball[1], self.ball[0]] + qpos[-7:-5] = self.goal + diff = self.ball - self.goal + angle = -np.arctan(diff[0] / (diff[1] + 1e-8)) + qpos[-1] = angle / 3.14 + qvel = self.init_qvel + self.np_random.uniform(low=-.1, high=.1, + size=self.model.nv) + qvel[7:] = 0 + self.set_state(qpos, qvel) + return self._get_obs() + + def _get_obs(self): + return np.concatenate([ + self.sim.data.qpos.flat[:7], + self.sim.data.qvel.flat[:7], + self.get_body_com("tips_arm"), + self.get_body_com("object"), + self.get_body_com("goal"), + ]) diff --git a/src/gym/envs/mujoco/swimmer.py b/src/gym/envs/mujoco/swimmer.py new file mode 100644 index 0000000..42574fa --- /dev/null +++ b/src/gym/envs/mujoco/swimmer.py @@ -0,0 +1,31 @@ +import numpy as np +from gym import utils +from gym.envs.mujoco import mujoco_env + +class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self): + mujoco_env.MujocoEnv.__init__(self, 'swimmer.xml', 4) + utils.EzPickle.__init__(self) + + def step(self, a): + ctrl_cost_coeff = 0.0001 + xposbefore = self.sim.data.qpos[0] + self.do_simulation(a, self.frame_skip) + xposafter = self.sim.data.qpos[0] + reward_fwd = (xposafter - xposbefore) / self.dt + reward_ctrl = - ctrl_cost_coeff * np.square(a).sum() + reward = reward_fwd + reward_ctrl + ob = self._get_obs() + return ob, reward, False, dict(reward_fwd=reward_fwd, reward_ctrl=reward_ctrl) + + def _get_obs(self): + qpos = self.sim.data.qpos + qvel = self.sim.data.qvel + return np.concatenate([qpos.flat[2:], qvel.flat]) + + def reset_model(self): + self.set_state( + self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq), + self.init_qvel + self.np_random.uniform(low=-.1, high=.1, size=self.model.nv) + ) + return self._get_obs() diff --git a/src/gym/envs/mujoco/thrower.py b/src/gym/envs/mujoco/thrower.py new file mode 100644 index 0000000..1e8ca8d --- /dev/null +++ b/src/gym/envs/mujoco/thrower.py @@ -0,0 +1,60 @@ +import numpy as np +from gym import utils +from gym.envs.mujoco import mujoco_env + +class ThrowerEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self): + utils.EzPickle.__init__(self) + self._ball_hit_ground = False + self._ball_hit_location = None + mujoco_env.MujocoEnv.__init__(self, 'thrower.xml', 5) + + def step(self, a): + ball_xy = self.get_body_com("ball")[:2] + goal_xy = self.get_body_com("goal")[:2] + + if not self._ball_hit_ground and self.get_body_com("ball")[2] < -0.25: + self._ball_hit_ground = True + self._ball_hit_location = self.get_body_com("ball") + + if self._ball_hit_ground: + ball_hit_xy = self._ball_hit_location[:2] + reward_dist = -np.linalg.norm(ball_hit_xy - goal_xy) + else: + reward_dist = -np.linalg.norm(ball_xy - goal_xy) + reward_ctrl = - np.square(a).sum() + + reward = reward_dist + 0.002 * reward_ctrl + self.do_simulation(a, self.frame_skip) + ob = self._get_obs() + done = False + return ob, reward, done, dict(reward_dist=reward_dist, + reward_ctrl=reward_ctrl) + + def viewer_setup(self): + self.viewer.cam.trackbodyid = 0 + self.viewer.cam.distance = 4.0 + + def reset_model(self): + self._ball_hit_ground = False + self._ball_hit_location = None + + qpos = self.init_qpos + self.goal = np.array([self.np_random.uniform(low=-0.3, high=0.3), + self.np_random.uniform(low=-0.3, high=0.3)]) + + qpos[-9:-7] = self.goal + qvel = self.init_qvel + self.np_random.uniform(low=-0.005, + high=0.005, size=self.model.nv) 
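+        # entries 0-6 of qvel belong to the arm joints; zero everything after them
+        # (the ball's free joint and the goal) so only the arm keeps the sampled noise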
+ qvel[7:] = 0 + self.set_state(qpos, qvel) + return self._get_obs() + + def _get_obs(self): + return np.concatenate([ + self.sim.data.qpos.flat[:7], + self.sim.data.qvel.flat[:7], + self.get_body_com("r_wrist_roll_link"), + self.get_body_com("ball"), + self.get_body_com("goal"), + ]) diff --git a/src/gym/envs/mujoco/walker2d.py b/src/gym/envs/mujoco/walker2d.py new file mode 100644 index 0000000..805f2dd --- /dev/null +++ b/src/gym/envs/mujoco/walker2d.py @@ -0,0 +1,40 @@ +import numpy as np +from gym import utils +from gym.envs.mujoco import mujoco_env + +class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle): + + def __init__(self): + mujoco_env.MujocoEnv.__init__(self, "walker2d.xml", 4) + utils.EzPickle.__init__(self) + + def step(self, a): + posbefore = self.sim.data.qpos[0] + self.do_simulation(a, self.frame_skip) + posafter, height, ang = self.sim.data.qpos[0:3] + alive_bonus = 1.0 + reward = ((posafter - posbefore) / self.dt) + reward += alive_bonus + reward -= 1e-3 * np.square(a).sum() + done = not (height > 0.8 and height < 2.0 and + ang > -1.0 and ang < 1.0) + ob = self._get_obs() + return ob, reward, done, {} + + def _get_obs(self): + qpos = self.sim.data.qpos + qvel = self.sim.data.qvel + return np.concatenate([qpos[1:], np.clip(qvel, -10, 10)]).ravel() + + def reset_model(self): + self.set_state( + self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq), + self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) + ) + return self._get_obs() + + def viewer_setup(self): + self.viewer.cam.trackbodyid = 2 + self.viewer.cam.distance = self.model.stat.extent * 0.5 + self.viewer.cam.lookat[2] = 1.15 + self.viewer.cam.elevation = -20 diff --git a/src/gym/envs/pilesos/__init__.py b/src/gym/envs/pilesos/__init__.py new file mode 100644 index 0000000..be7e1ab --- /dev/null +++ b/src/gym/envs/pilesos/__init__.py @@ -0,0 +1 @@ +from gym.envs.pilesos.pilesos import PilesosEnv diff --git a/src/gym/envs/pilesos/__pycache__/__init__.cpython-37.pyc b/src/gym/envs/pilesos/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..14f3991 Binary files /dev/null and b/src/gym/envs/pilesos/__pycache__/__init__.cpython-37.pyc differ diff --git a/src/gym/envs/pilesos/__pycache__/pilesos.cpython-37.pyc b/src/gym/envs/pilesos/__pycache__/pilesos.cpython-37.pyc new file mode 100644 index 0000000..8585697 Binary files /dev/null and b/src/gym/envs/pilesos/__pycache__/pilesos.cpython-37.pyc differ diff --git a/src/gym/envs/pilesos/pilesos.py b/src/gym/envs/pilesos/pilesos.py new file mode 100644 index 0000000..06dd124 --- /dev/null +++ b/src/gym/envs/pilesos/pilesos.py @@ -0,0 +1,46 @@ +import numpy as np + +from gym import core, spaces +from pilesos.mapstate import MapState + + +class PilesosEnv(core.Env): + metadata = {} + + IS_HOME = True + DIRT = False + ACTIONS = ['F', 'R', 'L', 'S', 'T'] + + state: MapState + + def __init__(self): + self.viewer = None + low = np.array([0, 0, 0]) + high = np.array([1, 1, 1]) + self.observation_space = spaces.Box(low=low, high=high, dtype=int) + self.action_space = spaces.Discrete(5) + + self.state = MapState.default() + self.current_obs = self.state.get_current_obs() + + def reset(self): + self.state = MapState.default() + return self._get_ob() + + def step(self, action): + action_char = self.ACTIONS[action] + + self.current_obs = self.state.act(action_char) + + has_finished = action_char == 'T' + + return self._get_ob(), self.state.last_reward, has_finished, {} + + def render(self, mode='human'): + 
raise NotImplementedError() + + def close(self): + pass + + def _get_ob(self): + return np.array([int(self.current_obs.has_bumped), int(self.current_obs.has_dirt), int(self.current_obs.is_home)]) diff --git a/src/gym/envs/registration.py b/src/gym/envs/registration.py new file mode 100644 index 0000000..655a7d7 --- /dev/null +++ b/src/gym/envs/registration.py @@ -0,0 +1,191 @@ +import re +from gym import error, logger + +# This format is true today, but it's *not* an official spec. +# [username/](env-name)-v(version) env-name is group 1, version is group 2 +# +# 2016-10-31: We're experimentally expanding the environment ID format +# to include an optional username. +env_id_re = re.compile(r'^(?:[\w:-]+\/)?([\w:.-]+)-v(\d+)$') + +def load(name): + import pkg_resources # takes ~400ms to load, so we import it lazily + entry_point = pkg_resources.EntryPoint.parse('x={}'.format(name)) + result = entry_point.load(False) + return result + +class EnvSpec(object): + """A specification for a particular instance of the environment. Used + to register the parameters for official evaluations. + + Args: + id (str): The official environment ID + entry_point (Optional[str]): The Python entrypoint of the environment class (e.g. module.name:Class) + trials (int): The number of trials to average reward over + reward_threshold (Optional[int]): The reward threshold before the task is considered solved + local_only: True iff the environment is to be used only on the local machine (e.g. debugging envs) + kwargs (dict): The kwargs to pass to the environment class + nondeterministic (bool): Whether this environment is non-deterministic even after seeding + tags (dict[str:any]): A set of arbitrary key-value tags on this environment, including simple property=True tags + + Attributes: + id (str): The official environment ID + trials (int): The number of trials run in official evaluation + """ + + def __init__(self, id, entry_point=None, trials=100, reward_threshold=None, local_only=False, kwargs=None, nondeterministic=False, tags=None, max_episode_steps=None, max_episode_seconds=None, timestep_limit=None): + self.id = id + # Evaluation parameters + self.trials = trials + self.reward_threshold = reward_threshold + # Environment properties + self.nondeterministic = nondeterministic + + if tags is None: + tags = {} + self.tags = tags + + # BACKWARDS COMPAT 2017/1/18 + if tags.get('wrapper_config.TimeLimit.max_episode_steps'): + max_episode_steps = tags.get('wrapper_config.TimeLimit.max_episode_steps') + # TODO: Add the following deprecation warning after 2017/02/18 + # warnings.warn("DEPRECATION WARNING wrapper_config.TimeLimit has been deprecated. Replace any calls to `register(tags={'wrapper_config.TimeLimit.max_episode_steps': 200)}` with `register(max_episode_steps=200)`. This change was made 2017/1/31 and is included in gym version 0.8.0. If you are getting many of these warnings, you may need to update universe past version 0.21.3") + + tags['wrapper_config.TimeLimit.max_episode_steps'] = max_episode_steps + ###### + + # BACKWARDS COMPAT 2017/1/31 + if timestep_limit is not None: + max_episode_steps = timestep_limit + # TODO: Add the following deprecation warning after 2017/03/01 + # warnings.warn("register(timestep_limit={}) is deprecated. Use register(max_episode_steps={}) instead.".format(timestep_limit, timestep_limit)) + ###### + + self.max_episode_steps = max_episode_steps + self.max_episode_seconds = max_episode_seconds + + # We may make some of these other parameters public if they're + # useful. 
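+        # e.g. 'FetchReach-v0' parses to env name 'FetchReach' and version '0';
+        # env_id_re also accepts an optional 'username/' prefix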
+ match = env_id_re.search(id) + if not match: + raise error.Error('Attempted to register malformed environment ID: {}. (Currently all IDs must be of the form {}.)'.format(id, env_id_re.pattern)) + self._env_name = match.group(1) + self._entry_point = entry_point + self._local_only = local_only + self._kwargs = {} if kwargs is None else kwargs + + def make(self): + """Instantiates an instance of the environment with appropriate kwargs""" + if self._entry_point is None: + raise error.Error('Attempting to make deprecated env {}. (HINT: is there a newer registered version of this env?)'.format(self.id)) + + elif callable(self._entry_point): + env = self._entry_point(**self._kwargs) + else: + cls = load(self._entry_point) + env = cls(**self._kwargs) + + # Make the enviroment aware of which spec it came from. + env.unwrapped.spec = self + + return env + + def __repr__(self): + return "EnvSpec({})".format(self.id) + + @property + def timestep_limit(self): + return self.max_episode_steps + + @timestep_limit.setter + def timestep_limit(self, value): + self.max_episode_steps = value + + +class EnvRegistry(object): + """Register an env by ID. IDs remain stable over time and are + guaranteed to resolve to the same environment dynamics (or be + desupported). The goal is that results on a particular environment + should always be comparable, and not depend on the version of the + code that was running. + """ + + def __init__(self): + self.env_specs = {} + + def make(self, id): + logger.info('Making new env: %s', id) + spec = self.spec(id) + env = spec.make() + # We used to have people override _reset/_step rather than + # reset/step. Set _gym_disable_underscore_compat = True on + # your environment if you use these methods and don't want + # compatibility code to be invoked. + if hasattr(env, "_reset") and hasattr(env, "_step") and not getattr(env, "_gym_disable_underscore_compat", False): + patch_deprecated_methods(env) + if (env.spec.timestep_limit is not None) and not spec.tags.get('vnc'): + from gym.wrappers.time_limit import TimeLimit + env = TimeLimit(env, + max_episode_steps=env.spec.max_episode_steps, + max_episode_seconds=env.spec.max_episode_seconds) + return env + + + def all(self): + return self.env_specs.values() + + def spec(self, id): + match = env_id_re.search(id) + if not match: + raise error.Error('Attempted to look up malformed environment ID: {}. 
(Currently all IDs must be of the form {}.)'.format(id.encode('utf-8'), env_id_re.pattern)) + + try: + return self.env_specs[id] + except KeyError: + # Parse the env name and check to see if it matches the non-version + # part of a valid env (could also check the exact number here) + env_name = match.group(1) + matching_envs = [valid_env_name for valid_env_name, valid_env_spec in self.env_specs.items() + if env_name == valid_env_spec._env_name] + if matching_envs: + raise error.DeprecatedEnv('Env {} not found (valid versions include {})'.format(id, matching_envs)) + else: + raise error.UnregisteredEnv('No registered env with id: {}'.format(id)) + + def register(self, id, **kwargs): + if id in self.env_specs: + raise error.Error('Cannot re-register id: {}'.format(id)) + self.env_specs[id] = EnvSpec(id, **kwargs) + +# Have a global registry +registry = EnvRegistry() + +def register(id, **kwargs): + return registry.register(id, **kwargs) + +def make(id): + return registry.make(id) + +def spec(id): + return registry.spec(id) + +warn_once = True + +def patch_deprecated_methods(env): + """ + Methods renamed from '_method' to 'method', render() no longer has 'close' parameter, close is a separate method. + For backward compatibility, this makes it possible to work with unmodified environments. + """ + global warn_once + if warn_once: + logger.warn("Environment '%s' has deprecated methods '_step' and '_reset' rather than 'step' and 'reset'. Compatibility code invoked. Set _gym_disable_underscore_compat = True to disable this behavior." % str(type(env))) + warn_once = False + env.reset = env._reset + env.step = env._step + env.seed = env._seed + def render(mode): + return env._render(mode, close=False) + def close(): + env._render("human", close=True) + env.render = render + env.close = close diff --git a/src/gym/envs/robotics/README.md b/src/gym/envs/robotics/README.md new file mode 100644 index 0000000..5dbbfda --- /dev/null +++ b/src/gym/envs/robotics/README.md @@ -0,0 +1,54 @@ +# Robotics environments + +Details and documentation on these robotics environments are available in our [blog post](https://blog.openai.com/ingredients-for-robotics-research/), the accompanying [technical report](https://arxiv.org/abs/1802.09464), and the [Gym website](https://gym.openai.com/envs/#robotics). + +If you use these environments, please cite the following paper: + +``` +@misc{1802.09464, + Author = {Matthias Plappert and Marcin Andrychowicz and Alex Ray and Bob McGrew and Bowen Baker and Glenn Powell and Jonas Schneider and Josh Tobin and Maciek Chociej and Peter Welinder and Vikash Kumar and Wojciech Zaremba}, + Title = {Multi-Goal Reinforcement Learning: Challenging Robotics Environments and Request for Research}, + Year = {2018}, + Eprint = {arXiv:1802.09464}, +} +``` + +## Fetch environments + + +[FetchReach-v0](https://gym.openai.com/envs/FetchReach-v0/): Fetch has to move its end-effector to the desired goal position. + + + + +[FetchSlide-v0](https://gym.openai.com/envs/FetchSlide-v0/): Fetch has to hit a puck across a long table such that it slides and comes to rest on the desired goal. + + + + +[FetchPush-v0](https://gym.openai.com/envs/FetchPush-v0/): Fetch has to move a box by pushing it until it reaches a desired goal position. + + + + +[FetchPickAndPlace-v0](https://gym.openai.com/envs/FetchPickAndPlace-v0/): Fetch has to pick up a box from a table using its gripper and move it to a desired goal above the table. 
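+These environments follow the standard Gym API. As a minimal sketch (assuming the `-v0` IDs linked above are registered in this copy of Gym and `mujoco_py` is installed):
+
+```python
+import gym
+
+env = gym.make('FetchReach-v0')          # sparse reward by default
+obs = env.reset()
+for _ in range(10):
+    action = env.action_space.sample()   # random continuous action
+    obs, reward, done, info = env.step(action)
+env.close()
+```
+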
+ +## Shadow Dexterous Hand environments + + +[HandReach-v0](https://gym.openai.com/envs/HandReach-v0/): ShadowHand has to reach with its thumb and a selected finger until they meet at a desired goal position above the palm. + + + + +[HandManipulateBlock-v0](https://gym.openai.com/envs/HandManipulateBlock-v0/): ShadowHand has to manipulate a block until it achieves a desired goal position and rotation. + + + + +[HandManipulateEgg-v0](https://gym.openai.com/envs/HandManipulateEgg-v0/): ShadowHand has to manipulate an egg until it achieves a desired goal position and rotation. + + + + +[HandManipulatePen-v0](https://gym.openai.com/envs/HandManipulatePen-v0/): ShadowHand has to manipulate a pen until it achieves a desired goal position and rotation. diff --git a/src/gym/envs/robotics/__init__.py b/src/gym/envs/robotics/__init__.py new file mode 100644 index 0000000..ee17769 --- /dev/null +++ b/src/gym/envs/robotics/__init__.py @@ -0,0 +1,10 @@ +from gym.envs.robotics.fetch_env import FetchEnv +from gym.envs.robotics.fetch.slide import FetchSlideEnv +from gym.envs.robotics.fetch.pick_and_place import FetchPickAndPlaceEnv +from gym.envs.robotics.fetch.push import FetchPushEnv +from gym.envs.robotics.fetch.reach import FetchReachEnv + +from gym.envs.robotics.hand.reach import HandReachEnv +from gym.envs.robotics.hand.manipulate import HandBlockEnv +from gym.envs.robotics.hand.manipulate import HandEggEnv +from gym.envs.robotics.hand.manipulate import HandPenEnv diff --git a/src/gym/envs/robotics/assets/LICENSE.md b/src/gym/envs/robotics/assets/LICENSE.md new file mode 100644 index 0000000..22ce901 --- /dev/null +++ b/src/gym/envs/robotics/assets/LICENSE.md @@ -0,0 +1,222 @@ +# Fetch Robotics +The model of the [Fetch](http://fetchrobotics.com/platforms-research-development/) is based on [models provided by Fetch](https://github.com/fetchrobotics/fetch_ros/tree/indigo-devel/fetch_description). It was adapted and refined by OpenAI. + +# ShadowHand +The model of the [ShadowHand](https://www.shadowrobot.com/products/dexterous-hand/) is based on [models provided by ShadowRobot](https://github.com/shadow-robot/sr_common/tree/kinetic-devel/sr_description/hand/model), and on code used under the following license: + +(C) Vikash Kumar, CSE, UW. Licensed under Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +Additional license notices: + + Sources : 1) Manipulator and Manipulation in High Dimensional Spaces. 
Vikash Kumar, Ph.D. Thesis, CSE, Univ. of Washington. 2016. + + Mujoco :: Advanced physics simulation engine + Source : www.roboti.us + Version : 1.40 + Released : 17Jan'17 + + Author :: Vikash Kumar + Contacts : vikash@openai.com + Last edits : 3Apr'17 diff --git a/src/gym/envs/robotics/assets/fetch/pick_and_place.xml b/src/gym/envs/robotics/assets/fetch/pick_and_place.xml new file mode 100644 index 0000000..337032a --- /dev/null +++ b/src/gym/envs/robotics/assets/fetch/pick_and_place.xml @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/gym/envs/robotics/assets/fetch/push.xml b/src/gym/envs/robotics/assets/fetch/push.xml new file mode 100644 index 0000000..8e12db2 --- /dev/null +++ b/src/gym/envs/robotics/assets/fetch/push.xml @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/gym/envs/robotics/assets/fetch/reach.xml b/src/gym/envs/robotics/assets/fetch/reach.xml new file mode 100644 index 0000000..c73d624 --- /dev/null +++ b/src/gym/envs/robotics/assets/fetch/reach.xml @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/gym/envs/robotics/assets/fetch/robot.xml b/src/gym/envs/robotics/assets/fetch/robot.xml new file mode 100644 index 0000000..9ee7723 --- /dev/null +++ b/src/gym/envs/robotics/assets/fetch/robot.xml @@ -0,0 +1,123 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/gym/envs/robotics/assets/fetch/shared.xml b/src/gym/envs/robotics/assets/fetch/shared.xml new file mode 100644 index 0000000..5d61fef --- /dev/null +++ b/src/gym/envs/robotics/assets/fetch/shared.xml @@ -0,0 +1,66 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/gym/envs/robotics/assets/fetch/slide.xml b/src/gym/envs/robotics/assets/fetch/slide.xml new file mode 100644 index 0000000..efbfb51 --- /dev/null +++ b/src/gym/envs/robotics/assets/fetch/slide.xml @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/gym/envs/robotics/assets/hand/manipulate_block.xml b/src/gym/envs/robotics/assets/hand/manipulate_block.xml new file mode 100644 index 0000000..83a6517 --- /dev/null +++ b/src/gym/envs/robotics/assets/hand/manipulate_block.xml @@ -0,0 +1,41 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/gym/envs/robotics/assets/hand/manipulate_egg.xml b/src/gym/envs/robotics/assets/hand/manipulate_egg.xml new file mode 100644 index 0000000..46d1dbb --- /dev/null +++ b/src/gym/envs/robotics/assets/hand/manipulate_egg.xml @@ -0,0 +1,40 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/gym/envs/robotics/assets/hand/manipulate_pen.xml b/src/gym/envs/robotics/assets/hand/manipulate_pen.xml new file mode 100644 index 0000000..20a6fb5 --- /dev/null +++ b/src/gym/envs/robotics/assets/hand/manipulate_pen.xml @@ -0,0 +1,40 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/gym/envs/robotics/assets/hand/reach.xml b/src/gym/envs/robotics/assets/hand/reach.xml new file mode 100644 index 0000000..71f6dfe --- /dev/null +++ b/src/gym/envs/robotics/assets/hand/reach.xml @@ -0,0 +1,34 @@ + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/gym/envs/robotics/assets/hand/robot.xml b/src/gym/envs/robotics/assets/hand/robot.xml new file mode 100644 index 0000000..dbb9e43 --- /dev/null +++ b/src/gym/envs/robotics/assets/hand/robot.xml @@ -0,0 +1,160 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/gym/envs/robotics/assets/hand/shared.xml b/src/gym/envs/robotics/assets/hand/shared.xml new file mode 100644 index 0000000..f27f265 --- /dev/null +++ b/src/gym/envs/robotics/assets/hand/shared.xml @@ -0,0 +1,254 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/gym/envs/robotics/assets/hand/shared_asset.xml b/src/gym/envs/robotics/assets/hand/shared_asset.xml new file mode 100644 index 0000000..ec9a0b0 --- /dev/null +++ b/src/gym/envs/robotics/assets/hand/shared_asset.xml @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/gym/envs/robotics/assets/stls/.get b/src/gym/envs/robotics/assets/stls/.get new file mode 100644 index 0000000..e69de29 diff --git a/src/gym/envs/robotics/assets/stls/fetch/base_link_collision.stl b/src/gym/envs/robotics/assets/stls/fetch/base_link_collision.stl new file mode 100644 index 0000000..1ef459f Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/fetch/base_link_collision.stl differ diff --git a/src/gym/envs/robotics/assets/stls/fetch/bellows_link_collision.stl b/src/gym/envs/robotics/assets/stls/fetch/bellows_link_collision.stl new file mode 100644 index 0000000..a7e5ab7 Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/fetch/bellows_link_collision.stl differ diff --git a/src/gym/envs/robotics/assets/stls/fetch/elbow_flex_link_collision.stl b/src/gym/envs/robotics/assets/stls/fetch/elbow_flex_link_collision.stl new file mode 100644 index 0000000..b0eea07 Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/fetch/elbow_flex_link_collision.stl differ diff --git a/src/gym/envs/robotics/assets/stls/fetch/estop_link.stl b/src/gym/envs/robotics/assets/stls/fetch/estop_link.stl new file mode 100644 index 0000000..f6d1c72 Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/fetch/estop_link.stl differ diff --git a/src/gym/envs/robotics/assets/stls/fetch/forearm_roll_link_collision.stl b/src/gym/envs/robotics/assets/stls/fetch/forearm_roll_link_collision.stl new file mode 100644 index 0000000..fe468c5 Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/fetch/forearm_roll_link_collision.stl differ diff --git a/src/gym/envs/robotics/assets/stls/fetch/gripper_link.stl b/src/gym/envs/robotics/assets/stls/fetch/gripper_link.stl new file mode 100644 index 0000000..8a14874 Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/fetch/gripper_link.stl differ diff --git 
a/src/gym/envs/robotics/assets/stls/fetch/head_pan_link_collision.stl b/src/gym/envs/robotics/assets/stls/fetch/head_pan_link_collision.stl new file mode 100644 index 0000000..c77b5b1 Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/fetch/head_pan_link_collision.stl differ diff --git a/src/gym/envs/robotics/assets/stls/fetch/head_tilt_link_collision.stl b/src/gym/envs/robotics/assets/stls/fetch/head_tilt_link_collision.stl new file mode 100644 index 0000000..53c2ddc Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/fetch/head_tilt_link_collision.stl differ diff --git a/src/gym/envs/robotics/assets/stls/fetch/l_wheel_link_collision.stl b/src/gym/envs/robotics/assets/stls/fetch/l_wheel_link_collision.stl new file mode 100644 index 0000000..5c17524 Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/fetch/l_wheel_link_collision.stl differ diff --git a/src/gym/envs/robotics/assets/stls/fetch/laser_link.stl b/src/gym/envs/robotics/assets/stls/fetch/laser_link.stl new file mode 100644 index 0000000..fa4882f Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/fetch/laser_link.stl differ diff --git a/src/gym/envs/robotics/assets/stls/fetch/r_wheel_link_collision.stl b/src/gym/envs/robotics/assets/stls/fetch/r_wheel_link_collision.stl new file mode 100644 index 0000000..3742b24 Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/fetch/r_wheel_link_collision.stl differ diff --git a/src/gym/envs/robotics/assets/stls/fetch/shoulder_lift_link_collision.stl b/src/gym/envs/robotics/assets/stls/fetch/shoulder_lift_link_collision.stl new file mode 100644 index 0000000..c9aff0d Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/fetch/shoulder_lift_link_collision.stl differ diff --git a/src/gym/envs/robotics/assets/stls/fetch/shoulder_pan_link_collision.stl b/src/gym/envs/robotics/assets/stls/fetch/shoulder_pan_link_collision.stl new file mode 100644 index 0000000..ac17a94 Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/fetch/shoulder_pan_link_collision.stl differ diff --git a/src/gym/envs/robotics/assets/stls/fetch/torso_fixed_link.stl b/src/gym/envs/robotics/assets/stls/fetch/torso_fixed_link.stl new file mode 100644 index 0000000..7cf7fc1 Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/fetch/torso_fixed_link.stl differ diff --git a/src/gym/envs/robotics/assets/stls/fetch/torso_lift_link_collision.stl b/src/gym/envs/robotics/assets/stls/fetch/torso_lift_link_collision.stl new file mode 100644 index 0000000..4ce5fcf Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/fetch/torso_lift_link_collision.stl differ diff --git a/src/gym/envs/robotics/assets/stls/fetch/upperarm_roll_link_collision.stl b/src/gym/envs/robotics/assets/stls/fetch/upperarm_roll_link_collision.stl new file mode 100644 index 0000000..1207932 Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/fetch/upperarm_roll_link_collision.stl differ diff --git a/src/gym/envs/robotics/assets/stls/fetch/wrist_flex_link_collision.stl b/src/gym/envs/robotics/assets/stls/fetch/wrist_flex_link_collision.stl new file mode 100644 index 0000000..3215d2e Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/fetch/wrist_flex_link_collision.stl differ diff --git a/src/gym/envs/robotics/assets/stls/fetch/wrist_roll_link_collision.stl b/src/gym/envs/robotics/assets/stls/fetch/wrist_roll_link_collision.stl new file mode 100644 index 0000000..742bdd9 Binary files /dev/null and 
b/src/gym/envs/robotics/assets/stls/fetch/wrist_roll_link_collision.stl differ diff --git a/src/gym/envs/robotics/assets/stls/hand/F1.stl b/src/gym/envs/robotics/assets/stls/hand/F1.stl new file mode 100644 index 0000000..515d3c9 Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/hand/F1.stl differ diff --git a/src/gym/envs/robotics/assets/stls/hand/F2.stl b/src/gym/envs/robotics/assets/stls/hand/F2.stl new file mode 100644 index 0000000..7bc5e20 Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/hand/F2.stl differ diff --git a/src/gym/envs/robotics/assets/stls/hand/F3.stl b/src/gym/envs/robotics/assets/stls/hand/F3.stl new file mode 100644 index 0000000..223f06f Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/hand/F3.stl differ diff --git a/src/gym/envs/robotics/assets/stls/hand/TH1_z.stl b/src/gym/envs/robotics/assets/stls/hand/TH1_z.stl new file mode 100644 index 0000000..400ee2d Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/hand/TH1_z.stl differ diff --git a/src/gym/envs/robotics/assets/stls/hand/TH2_z.stl b/src/gym/envs/robotics/assets/stls/hand/TH2_z.stl new file mode 100644 index 0000000..5ace838 Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/hand/TH2_z.stl differ diff --git a/src/gym/envs/robotics/assets/stls/hand/TH3_z.stl b/src/gym/envs/robotics/assets/stls/hand/TH3_z.stl new file mode 100644 index 0000000..23485ab Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/hand/TH3_z.stl differ diff --git a/src/gym/envs/robotics/assets/stls/hand/forearm_electric.stl b/src/gym/envs/robotics/assets/stls/hand/forearm_electric.stl new file mode 100644 index 0000000..80f6f3d Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/hand/forearm_electric.stl differ diff --git a/src/gym/envs/robotics/assets/stls/hand/forearm_electric_cvx.stl b/src/gym/envs/robotics/assets/stls/hand/forearm_electric_cvx.stl new file mode 100644 index 0000000..3c30f57 Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/hand/forearm_electric_cvx.stl differ diff --git a/src/gym/envs/robotics/assets/stls/hand/knuckle.stl b/src/gym/envs/robotics/assets/stls/hand/knuckle.stl new file mode 100644 index 0000000..4faedd7 Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/hand/knuckle.stl differ diff --git a/src/gym/envs/robotics/assets/stls/hand/lfmetacarpal.stl b/src/gym/envs/robotics/assets/stls/hand/lfmetacarpal.stl new file mode 100644 index 0000000..535cf4d Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/hand/lfmetacarpal.stl differ diff --git a/src/gym/envs/robotics/assets/stls/hand/palm.stl b/src/gym/envs/robotics/assets/stls/hand/palm.stl new file mode 100644 index 0000000..65e47eb Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/hand/palm.stl differ diff --git a/src/gym/envs/robotics/assets/stls/hand/wrist.stl b/src/gym/envs/robotics/assets/stls/hand/wrist.stl new file mode 100644 index 0000000..420d5f9 Binary files /dev/null and b/src/gym/envs/robotics/assets/stls/hand/wrist.stl differ diff --git a/src/gym/envs/robotics/assets/textures/block.png b/src/gym/envs/robotics/assets/textures/block.png new file mode 100644 index 0000000..0243b8f Binary files /dev/null and b/src/gym/envs/robotics/assets/textures/block.png differ diff --git a/src/gym/envs/robotics/assets/textures/block_hidden.png b/src/gym/envs/robotics/assets/textures/block_hidden.png new file mode 100644 index 0000000..e08b861 Binary files /dev/null and b/src/gym/envs/robotics/assets/textures/block_hidden.png 
differ diff --git a/src/gym/envs/robotics/fetch/__init__.py b/src/gym/envs/robotics/fetch/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/gym/envs/robotics/fetch/pick_and_place.py b/src/gym/envs/robotics/fetch/pick_and_place.py new file mode 100644 index 0000000..9b4d049 --- /dev/null +++ b/src/gym/envs/robotics/fetch/pick_and_place.py @@ -0,0 +1,18 @@ +from gym import utils +from gym.envs.robotics import fetch_env + + +class FetchPickAndPlaceEnv(fetch_env.FetchEnv, utils.EzPickle): + def __init__(self, reward_type='sparse'): + initial_qpos = { + 'robot0:slide0': 0.405, + 'robot0:slide1': 0.48, + 'robot0:slide2': 0.0, + 'object0:joint': [1.25, 0.53, 0.4, 1., 0., 0., 0.], + } + fetch_env.FetchEnv.__init__( + self, 'fetch/pick_and_place.xml', has_object=True, block_gripper=False, n_substeps=20, + gripper_extra_height=0.2, target_in_the_air=True, target_offset=0.0, + obj_range=0.15, target_range=0.15, distance_threshold=0.05, + initial_qpos=initial_qpos, reward_type=reward_type) + utils.EzPickle.__init__(self) diff --git a/src/gym/envs/robotics/fetch/push.py b/src/gym/envs/robotics/fetch/push.py new file mode 100644 index 0000000..a3c1963 --- /dev/null +++ b/src/gym/envs/robotics/fetch/push.py @@ -0,0 +1,18 @@ +from gym import utils +from gym.envs.robotics import fetch_env + + +class FetchPushEnv(fetch_env.FetchEnv, utils.EzPickle): + def __init__(self, reward_type='sparse'): + initial_qpos = { + 'robot0:slide0': 0.405, + 'robot0:slide1': 0.48, + 'robot0:slide2': 0.0, + 'object0:joint': [1.25, 0.53, 0.4, 1., 0., 0., 0.], + } + fetch_env.FetchEnv.__init__( + self, 'fetch/push.xml', has_object=True, block_gripper=True, n_substeps=20, + gripper_extra_height=0.0, target_in_the_air=False, target_offset=0.0, + obj_range=0.15, target_range=0.15, distance_threshold=0.05, + initial_qpos=initial_qpos, reward_type=reward_type) + utils.EzPickle.__init__(self) diff --git a/src/gym/envs/robotics/fetch/reach.py b/src/gym/envs/robotics/fetch/reach.py new file mode 100644 index 0000000..7504e4d --- /dev/null +++ b/src/gym/envs/robotics/fetch/reach.py @@ -0,0 +1,17 @@ +from gym import utils +from gym.envs.robotics import fetch_env + + +class FetchReachEnv(fetch_env.FetchEnv, utils.EzPickle): + def __init__(self, reward_type='sparse'): + initial_qpos = { + 'robot0:slide0': 0.4049, + 'robot0:slide1': 0.48, + 'robot0:slide2': 0.0, + } + fetch_env.FetchEnv.__init__( + self, 'fetch/reach.xml', has_object=False, block_gripper=True, n_substeps=20, + gripper_extra_height=0.2, target_in_the_air=True, target_offset=0.0, + obj_range=0.15, target_range=0.15, distance_threshold=0.05, + initial_qpos=initial_qpos, reward_type=reward_type) + utils.EzPickle.__init__(self) diff --git a/src/gym/envs/robotics/fetch/slide.py b/src/gym/envs/robotics/fetch/slide.py new file mode 100644 index 0000000..b8a02a9 --- /dev/null +++ b/src/gym/envs/robotics/fetch/slide.py @@ -0,0 +1,20 @@ +import numpy as np + +from gym import utils +from gym.envs.robotics import fetch_env + + +class FetchSlideEnv(fetch_env.FetchEnv, utils.EzPickle): + def __init__(self, reward_type='sparse'): + initial_qpos = { + 'robot0:slide0': 0.05, + 'robot0:slide1': 0.48, + 'robot0:slide2': 0.0, + 'object0:joint': [1.7, 1.1, 0.4, 1., 0., 0., 0.], + } + fetch_env.FetchEnv.__init__( + self, 'fetch/slide.xml', has_object=True, block_gripper=True, n_substeps=20, + gripper_extra_height=-0.02, target_in_the_air=False, target_offset=np.array([0.4, 0.0, 0.0]), + obj_range=0.1, target_range=0.3, distance_threshold=0.05, + initial_qpos=initial_qpos, 
reward_type=reward_type) + utils.EzPickle.__init__(self) diff --git a/src/gym/envs/robotics/fetch_env.py b/src/gym/envs/robotics/fetch_env.py new file mode 100644 index 0000000..4916c4b --- /dev/null +++ b/src/gym/envs/robotics/fetch_env.py @@ -0,0 +1,187 @@ +import numpy as np + +from gym.envs.robotics import rotations, robot_env, utils + + +def goal_distance(goal_a, goal_b): + assert goal_a.shape == goal_b.shape + return np.linalg.norm(goal_a - goal_b, axis=-1) + + +class FetchEnv(robot_env.RobotEnv): + """Superclass for all Fetch environments. + """ + + def __init__( + self, model_path, n_substeps, gripper_extra_height, block_gripper, + has_object, target_in_the_air, target_offset, obj_range, target_range, + distance_threshold, initial_qpos, reward_type, + ): + """Initializes a new Fetch environment. + + Args: + model_path (string): path to the environments XML file + n_substeps (int): number of substeps the simulation runs on every call to step + gripper_extra_height (float): additional height above the table when positioning the gripper + block_gripper (boolean): whether or not the gripper is blocked (i.e. not movable) or not + has_object (boolean): whether or not the environment has an object + target_in_the_air (boolean): whether or not the target should be in the air above the table or on the table surface + target_offset (float or array with 3 elements): offset of the target + obj_range (float): range of a uniform distribution for sampling initial object positions + target_range (float): range of a uniform distribution for sampling a target + distance_threshold (float): the threshold after which a goal is considered achieved + initial_qpos (dict): a dictionary of joint names and values that define the initial configuration + reward_type ('sparse' or 'dense'): the reward type, i.e. sparse or dense + """ + self.gripper_extra_height = gripper_extra_height + self.block_gripper = block_gripper + self.has_object = has_object + self.target_in_the_air = target_in_the_air + self.target_offset = target_offset + self.obj_range = obj_range + self.target_range = target_range + self.distance_threshold = distance_threshold + self.reward_type = reward_type + + super(FetchEnv, self).__init__( + model_path=model_path, n_substeps=n_substeps, n_actions=4, + initial_qpos=initial_qpos) + + # GoalEnv methods + # ---------------------------- + + def compute_reward(self, achieved_goal, goal, info): + # Compute distance between goal and the achieved goal. + d = goal_distance(achieved_goal, goal) + if self.reward_type == 'sparse': + return -(d > self.distance_threshold).astype(np.float32) + else: + return -d + + # RobotEnv methods + # ---------------------------- + + def _step_callback(self): + if self.block_gripper: + self.sim.data.set_joint_qpos('robot0:l_gripper_finger_joint', 0.) + self.sim.data.set_joint_qpos('robot0:r_gripper_finger_joint', 0.) + self.sim.forward() + + def _set_action(self, action): + assert action.shape == (4,) + action = action.copy() # ensure that we don't change the action outside of this scope + pos_ctrl, gripper_ctrl = action[:3], action[3] + + pos_ctrl *= 0.05 # limit maximum change in position + rot_ctrl = [1., 0., 1., 0.] # fixed rotation of the end effector, expressed as a quaternion + gripper_ctrl = np.array([gripper_ctrl, gripper_ctrl]) + assert gripper_ctrl.shape == (2,) + if self.block_gripper: + gripper_ctrl = np.zeros_like(gripper_ctrl) + action = np.concatenate([pos_ctrl, rot_ctrl, gripper_ctrl]) + + # Apply action to simulation. 
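# A minimal standalone sketch (numpy only, no simulator) of the expansion that
# FetchEnv._set_action performs above: the 4-D agent action becomes the 9-element
# vector handed to utils.ctrl_set_action / utils.mocap_set_action below. The
# numeric action values are made-up examples.
import numpy as np

agent_action = np.array([0.5, -0.2, 0.1, 1.0])           # dx, dy, dz, gripper
pos_ctrl = agent_action[:3] * 0.05                        # limit positional change
rot_ctrl = np.array([1., 0., 1., 0.])                     # fixed end-effector quaternion
gripper_ctrl = np.array([agent_action[3], agent_action[3]])
full_action = np.concatenate([pos_ctrl, rot_ctrl, gripper_ctrl])
assert full_action.shape == (9,)                          # 3 (pos) + 4 (quat) + 2 (fingers)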
+ utils.ctrl_set_action(self.sim, action) + utils.mocap_set_action(self.sim, action) + + def _get_obs(self): + # positions + grip_pos = self.sim.data.get_site_xpos('robot0:grip') + dt = self.sim.nsubsteps * self.sim.model.opt.timestep + grip_velp = self.sim.data.get_site_xvelp('robot0:grip') * dt + robot_qpos, robot_qvel = utils.robot_get_obs(self.sim) + if self.has_object: + object_pos = self.sim.data.get_site_xpos('object0') + # rotations + object_rot = rotations.mat2euler(self.sim.data.get_site_xmat('object0')) + # velocities + object_velp = self.sim.data.get_site_xvelp('object0') * dt + object_velr = self.sim.data.get_site_xvelr('object0') * dt + # gripper state + object_rel_pos = object_pos - grip_pos + object_velp -= grip_velp + else: + object_pos = object_rot = object_velp = object_velr = object_rel_pos = np.zeros(0) + gripper_state = robot_qpos[-2:] + gripper_vel = robot_qvel[-2:] * dt # change to a scalar if the gripper is made symmetric + + if not self.has_object: + achieved_goal = grip_pos.copy() + else: + achieved_goal = np.squeeze(object_pos.copy()) + obs = np.concatenate([ + grip_pos, object_pos.ravel(), object_rel_pos.ravel(), gripper_state, object_rot.ravel(), + object_velp.ravel(), object_velr.ravel(), grip_velp, gripper_vel, + ]) + + return { + 'observation': obs.copy(), + 'achieved_goal': achieved_goal.copy(), + 'desired_goal': self.goal.copy(), + } + + def _viewer_setup(self): + body_id = self.sim.model.body_name2id('robot0:gripper_link') + lookat = self.sim.data.body_xpos[body_id] + for idx, value in enumerate(lookat): + self.viewer.cam.lookat[idx] = value + self.viewer.cam.distance = 2.5 + self.viewer.cam.azimuth = 132. + self.viewer.cam.elevation = -14. + + def _render_callback(self): + # Visualize target. + sites_offset = (self.sim.data.site_xpos - self.sim.model.site_pos).copy() + site_id = self.sim.model.site_name2id('target0') + self.sim.model.site_pos[site_id] = self.goal - sites_offset[0] + self.sim.forward() + + def _reset_sim(self): + self.sim.set_state(self.initial_state) + + # Randomize start position of object. + if self.has_object: + object_xpos = self.initial_gripper_xpos[:2] + while np.linalg.norm(object_xpos - self.initial_gripper_xpos[:2]) < 0.1: + object_xpos = self.initial_gripper_xpos[:2] + self.np_random.uniform(-self.obj_range, self.obj_range, size=2) + object_qpos = self.sim.data.get_joint_qpos('object0:joint') + assert object_qpos.shape == (7,) + object_qpos[:2] = object_xpos + self.sim.data.set_joint_qpos('object0:joint', object_qpos) + + self.sim.forward() + return True + + def _sample_goal(self): + if self.has_object: + goal = self.initial_gripper_xpos[:3] + self.np_random.uniform(-self.target_range, self.target_range, size=3) + goal += self.target_offset + goal[2] = self.height_offset + if self.target_in_the_air and self.np_random.uniform() < 0.5: + goal[2] += self.np_random.uniform(0, 0.45) + else: + goal = self.initial_gripper_xpos[:3] + self.np_random.uniform(-0.15, 0.15, size=3) + return goal.copy() + + def _is_success(self, achieved_goal, desired_goal): + d = goal_distance(achieved_goal, desired_goal) + return (d < self.distance_threshold).astype(np.float32) + + def _env_setup(self, initial_qpos): + for name, value in initial_qpos.items(): + self.sim.data.set_joint_qpos(name, value) + utils.reset_mocap_welds(self.sim) + self.sim.forward() + + # Move end effector into position. 
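# A usage sketch of the goal-conditioned interface defined above (_get_obs and
# compute_reward): because compute_reward() receives the achieved and desired goals
# explicitly, a stored transition can be re-scored against any substituted goal
# (e.g. for hindsight relabeling). `env` is assumed to be an already constructed
# FetchEnv instance; only methods from this file are used.
env.reset()
next_obs, reward, done, info = env.step(env.action_space.sample())
# Re-score the same transition as if the desired goal had been what was achieved:
relabeled_reward = env.compute_reward(next_obs['achieved_goal'],
                                      next_obs['achieved_goal'], info)
# With the sparse setting, relabeled_reward is 0 (success), while `reward` is -1
# whenever the original goal was farther away than distance_threshold.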
+ gripper_target = np.array([-0.498, 0.005, -0.431 + self.gripper_extra_height]) + self.sim.data.get_site_xpos('robot0:grip') + gripper_rotation = np.array([1., 0., 1., 0.]) + self.sim.data.set_mocap_pos('robot0:mocap', gripper_target) + self.sim.data.set_mocap_quat('robot0:mocap', gripper_rotation) + for _ in range(10): + self.sim.step() + + # Extract information for sampling goals. + self.initial_gripper_xpos = self.sim.data.get_site_xpos('robot0:grip').copy() + if self.has_object: + self.height_offset = self.sim.data.get_site_xpos('object0')[2] diff --git a/src/gym/envs/robotics/hand/__init__.py b/src/gym/envs/robotics/hand/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/gym/envs/robotics/hand/manipulate.py b/src/gym/envs/robotics/hand/manipulate.py new file mode 100644 index 0000000..e488d49 --- /dev/null +++ b/src/gym/envs/robotics/hand/manipulate.py @@ -0,0 +1,292 @@ +import numpy as np + +from gym import utils, error +from gym.envs.robotics import rotations, hand_env +from gym.envs.robotics.utils import robot_get_obs + +try: + import mujoco_py +except ImportError as e: + raise error.DependencyNotInstalled("{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e)) + + +def quat_from_angle_and_axis(angle, axis): + assert axis.shape == (3,) + axis /= np.linalg.norm(axis) + quat = np.concatenate([[np.cos(angle / 2.)], np.sin(angle / 2.) * axis]) + quat /= np.linalg.norm(quat) + return quat + + +class ManipulateEnv(hand_env.HandEnv, utils.EzPickle): + def __init__( + self, model_path, target_position, target_rotation, + target_position_range, reward_type, initial_qpos={}, + randomize_initial_position=True, randomize_initial_rotation=True, + distance_threshold=0.01, rotation_threshold=0.1, n_substeps=20, relative_control=False, + ignore_z_target_rotation=False, + ): + """Initializes a new Hand manipulation environment. + + Args: + model_path (string): path to the environments XML file + target_position (string): the type of target position: + - ignore: target position is fully ignored, i.e. the object can be positioned arbitrarily + - fixed: target position is set to the initial position of the object + - random: target position is fully randomized according to target_position_range + target_rotation (string): the type of target rotation: + - ignore: target rotation is fully ignored, i.e. the object can be rotated arbitrarily + - fixed: target rotation is set to the initial rotation of the object + - xyz: fully randomized target rotation around the X, Y and Z axis + - z: fully randomized target rotation around the Z axis + - parallel: fully randomized target rotation around Z and axis-aligned rotation around X, Y + ignore_z_target_rotation (boolean): whether or not the Z axis of the target rotation is ignored + target_position_range (np.array of shape (3, 2)): range of the target_position randomization + reward_type ('sparse' or 'dense'): the reward type, i.e. 
sparse or dense + initial_qpos (dict): a dictionary of joint names and values that define the initial configuration + randomize_initial_position (boolean): whether or not to randomize the initial position of the object + randomize_initial_rotation (boolean): whether or not to randomize the initial rotation of the object + distance_threshold (float, in meters): the threshold after which the position of a goal is considered achieved + rotation_threshold (float, in radians): the threshold after which the rotation of a goal is considered achieved + n_substeps (int): number of substeps the simulation runs on every call to step + relative_control (boolean): whether or not the hand is actuated in absolute joint positions or relative to the current state + """ + self.target_position = target_position + self.target_rotation = target_rotation + self.target_position_range = target_position_range + self.parallel_quats = [rotations.euler2quat(r) for r in rotations.get_parallel_rotations()] + self.randomize_initial_rotation = randomize_initial_rotation + self.randomize_initial_position = randomize_initial_position + self.distance_threshold = distance_threshold + self.rotation_threshold = rotation_threshold + self.reward_type = reward_type + self.ignore_z_target_rotation = ignore_z_target_rotation + + assert self.target_position in ['ignore', 'fixed', 'random'] + assert self.target_rotation in ['ignore', 'fixed', 'xyz', 'z', 'parallel'] + + hand_env.HandEnv.__init__( + self, model_path, n_substeps=n_substeps, initial_qpos=initial_qpos, + relative_control=relative_control) + utils.EzPickle.__init__(self) + + def _get_achieved_goal(self): + # Object position and rotation. + object_qpos = self.sim.data.get_joint_qpos('object:joint') + assert object_qpos.shape == (7,) + return object_qpos + + def _goal_distance(self, goal_a, goal_b): + assert goal_a.shape == goal_b.shape + assert goal_a.shape[-1] == 7 + + d_pos = np.zeros_like(goal_a[..., 0]) + d_rot = np.zeros_like(goal_b[..., 0]) + if self.target_position != 'ignore': + delta_pos = goal_a[..., :3] - goal_b[..., :3] + d_pos = np.linalg.norm(delta_pos, axis=-1) + + if self.target_rotation != 'ignore': + quat_a, quat_b = goal_a[..., 3:], goal_b[..., 3:] + + if self.ignore_z_target_rotation: + # Special case: We want to ignore the Z component of the rotation. + # This code here assumes Euler angles with xyz convention. We first transform + # to euler, then set the Z component to be equal between the two, and finally + # transform back into quaternions. + euler_a = rotations.quat2euler(quat_a) + euler_b = rotations.quat2euler(quat_b) + euler_a[2] = euler_b[2] + quat_a = rotations.euler2quat(euler_a) + + # Subtract quaternions and extract angle between them. + quat_diff = rotations.quat_mul(quat_a, rotations.quat_conjugate(quat_b)) + angle_diff = 2 * np.arccos(np.clip(quat_diff[..., 0], -1., 1.)) + d_rot = angle_diff + assert d_pos.shape == d_rot.shape + return d_pos, d_rot + + # GoalEnv methods + # ---------------------------- + + def compute_reward(self, achieved_goal, goal, info): + if self.reward_type == 'sparse': + success = self._is_success(achieved_goal, goal).astype(np.float32) + return (success - 1.) + else: + d_pos, d_rot = self._goal_distance(achieved_goal, goal) + # We weigh the difference in position to avoid that `d_pos` (in meters) is completely + # dominated by `d_rot` (in radians). + return -(10. 
* d_pos + d_rot) + + # RobotEnv methods + # ---------------------------- + + def _is_success(self, achieved_goal, desired_goal): + d_pos, d_rot = self._goal_distance(achieved_goal, desired_goal) + achieved_pos = (d_pos < self.distance_threshold).astype(np.float32) + achieved_rot = (d_rot < self.rotation_threshold).astype(np.float32) + achieved_both = achieved_pos * achieved_rot + return achieved_both + + def _env_setup(self, initial_qpos): + for name, value in initial_qpos.items(): + self.sim.data.set_joint_qpos(name, value) + self.sim.forward() + + def _reset_sim(self): + self.sim.set_state(self.initial_state) + self.sim.forward() + + initial_qpos = self.sim.data.get_joint_qpos('object:joint').copy() + initial_pos, initial_quat = initial_qpos[:3], initial_qpos[3:] + assert initial_qpos.shape == (7,) + assert initial_pos.shape == (3,) + assert initial_quat.shape == (4,) + initial_qpos = None + + # Randomization initial rotation. + if self.randomize_initial_rotation: + if self.target_rotation == 'z': + angle = self.np_random.uniform(-np.pi, np.pi) + axis = np.array([0., 0., 1.]) + offset_quat = quat_from_angle_and_axis(angle, axis) + initial_quat = rotations.quat_mul(initial_quat, offset_quat) + elif self.target_rotation == 'parallel': + angle = self.np_random.uniform(-np.pi, np.pi) + axis = np.array([0., 0., 1.]) + z_quat = quat_from_angle_and_axis(angle, axis) + parallel_quat = self.parallel_quats[self.np_random.randint(len(self.parallel_quats))] + offset_quat = rotations.quat_mul(z_quat, parallel_quat) + initial_quat = rotations.quat_mul(initial_quat, offset_quat) + elif self.target_rotation in ['xyz', 'ignore']: + angle = self.np_random.uniform(-np.pi, np.pi) + axis = self.np_random.uniform(-1., 1., size=3) + offset_quat = quat_from_angle_and_axis(angle, axis) + initial_quat = rotations.quat_mul(initial_quat, offset_quat) + elif self.target_rotation == 'fixed': + pass + else: + raise error.Error('Unknown target_rotation option "{}".'.format(self.target_rotation)) + + # Randomize initial position. + if self.randomize_initial_position: + if self.target_position != 'fixed': + initial_pos += self.np_random.normal(size=3, scale=0.005) + + initial_quat /= np.linalg.norm(initial_quat) + initial_qpos = np.concatenate([initial_pos, initial_quat]) + self.sim.data.set_joint_qpos('object:joint', initial_qpos) + + def is_on_palm(): + self.sim.forward() + cube_middle_idx = self.sim.model.site_name2id('object:center') + cube_middle_pos = self.sim.data.site_xpos[cube_middle_idx] + is_on_palm = (cube_middle_pos[2] > 0.04) + return is_on_palm + + # Run the simulation for a bunch of timesteps to let everything settle in. + for _ in range(10): + self._set_action(np.zeros(20)) + try: + self.sim.step() + except mujoco_py.MujocoException: + return False + return is_on_palm() + + def _sample_goal(self): + # Select a goal for the object position. + target_pos = None + if self.target_position == 'random': + assert self.target_position_range.shape == (3, 2) + offset = self.np_random.uniform(self.target_position_range[:, 0], self.target_position_range[:, 1]) + assert offset.shape == (3,) + target_pos = self.sim.data.get_joint_qpos('object:joint')[:3] + offset + elif self.target_position in ['ignore', 'fixed']: + target_pos = self.sim.data.get_joint_qpos('object:joint')[:3] + else: + raise error.Error('Unknown target_position option "{}".'.format(self.target_position)) + assert target_pos is not None + assert target_pos.shape == (3,) + + # Select a goal for the object rotation. 
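# A standalone sketch of the 'z' case handled just below: the target orientation is
# a rotation about the vertical axis by a uniformly sampled angle, converted to a
# unit quaternion exactly as quat_from_angle_and_axis() at the top of this file does
# (np.random is used here in place of the seeded self.np_random).
import numpy as np

angle = np.random.uniform(-np.pi, np.pi)
axis = np.array([0., 0., 1.])
target_quat = np.concatenate([[np.cos(angle / 2.)], np.sin(angle / 2.) * axis])
target_quat /= np.linalg.norm(target_quat)
assert abs(np.linalg.norm(target_quat) - 1.0) < 1e-9      # unit quaternion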
+ target_quat = None + if self.target_rotation == 'z': + angle = self.np_random.uniform(-np.pi, np.pi) + axis = np.array([0., 0., 1.]) + target_quat = quat_from_angle_and_axis(angle, axis) + elif self.target_rotation == 'parallel': + angle = self.np_random.uniform(-np.pi, np.pi) + axis = np.array([0., 0., 1.]) + target_quat = quat_from_angle_and_axis(angle, axis) + parallel_quat = self.parallel_quats[self.np_random.randint(len(self.parallel_quats))] + target_quat = rotations.quat_mul(target_quat, parallel_quat) + elif self.target_rotation == 'xyz': + angle = self.np_random.uniform(-np.pi, np.pi) + axis = self.np_random.uniform(-1., 1., size=3) + target_quat = quat_from_angle_and_axis(angle, axis) + elif self.target_rotation in ['ignore', 'fixed']: + target_quat = self.sim.data.get_joint_qpos('object:joint') + else: + raise error.Error('Unknown target_rotation option "{}".'.format(self.target_rotation)) + assert target_quat is not None + assert target_quat.shape == (4,) + + target_quat /= np.linalg.norm(target_quat) # normalized quaternion + goal = np.concatenate([target_pos, target_quat]) + return goal + + def _render_callback(self): + # Assign current state to target object but offset a bit so that the actual object + # is not obscured. + goal = self.goal.copy() + assert goal.shape == (7,) + if self.target_position == 'ignore': + # Move the object to the side since we do not care about it's position. + goal[0] += 0.15 + self.sim.data.set_joint_qpos('target:joint', goal) + self.sim.data.set_joint_qvel('target:joint', np.zeros(6)) + + if 'object_hidden' in self.sim.model.geom_names: + hidden_id = self.sim.model.geom_name2id('object_hidden') + self.sim.model.geom_rgba[hidden_id, 3] = 1. + self.sim.forward() + + def _get_obs(self): + robot_qpos, robot_qvel = robot_get_obs(self.sim) + object_qvel = self.sim.data.get_joint_qvel('object:joint') + achieved_goal = self._get_achieved_goal().ravel() # this contains the object position + rotation + observation = np.concatenate([robot_qpos, robot_qvel, object_qvel, achieved_goal]) + return { + 'observation': observation.copy(), + 'achieved_goal': achieved_goal.copy(), + 'desired_goal': self.goal.ravel().copy(), + } + + +class HandBlockEnv(ManipulateEnv): + def __init__(self, target_position='random', target_rotation='xyz', reward_type='sparse'): + super(HandBlockEnv, self).__init__( + model_path='hand/manipulate_block.xml', target_position=target_position, + target_rotation=target_rotation, + target_position_range=np.array([(-0.04, 0.04), (-0.06, 0.02), (0.0, 0.06)]), + reward_type=reward_type) + + +class HandEggEnv(ManipulateEnv): + def __init__(self, target_position='random', target_rotation='xyz', reward_type='sparse'): + super(HandEggEnv, self).__init__( + model_path='hand/manipulate_egg.xml', target_position=target_position, + target_rotation=target_rotation, + target_position_range=np.array([(-0.04, 0.04), (-0.06, 0.02), (0.0, 0.06)]), + reward_type=reward_type) + + +class HandPenEnv(ManipulateEnv): + def __init__(self, target_position='random', target_rotation='xyz', reward_type='sparse'): + super(HandPenEnv, self).__init__( + model_path='hand/manipulate_pen.xml', target_position=target_position, + target_rotation=target_rotation, + target_position_range=np.array([(-0.04, 0.04), (-0.06, 0.02), (0.0, 0.06)]), + randomize_initial_rotation=False, reward_type=reward_type, + ignore_z_target_rotation=True, distance_threshold=0.05) diff --git a/src/gym/envs/robotics/hand/reach.py b/src/gym/envs/robotics/hand/reach.py new file mode 100644 index 
0000000..1f6f9f8 --- /dev/null +++ b/src/gym/envs/robotics/hand/reach.py @@ -0,0 +1,144 @@ +import numpy as np + +from gym import utils +from gym.envs.robotics import hand_env +from gym.envs.robotics.utils import robot_get_obs + + +FINGERTIP_SITE_NAMES = [ + 'robot0:S_fftip', + 'robot0:S_mftip', + 'robot0:S_rftip', + 'robot0:S_lftip', + 'robot0:S_thtip', +] + + +DEFAULT_INITIAL_QPOS = { + 'robot0:WRJ1': -0.16514339750464327, + 'robot0:WRJ0': -0.31973286565062153, + 'robot0:FFJ3': 0.14340512546557435, + 'robot0:FFJ2': 0.32028208333591573, + 'robot0:FFJ1': 0.7126053607727917, + 'robot0:FFJ0': 0.6705281001412586, + 'robot0:MFJ3': 0.000246444303701037, + 'robot0:MFJ2': 0.3152655251085491, + 'robot0:MFJ1': 0.7659800313729842, + 'robot0:MFJ0': 0.7323156897425923, + 'robot0:RFJ3': 0.00038520700007378114, + 'robot0:RFJ2': 0.36743546201985233, + 'robot0:RFJ1': 0.7119514095008576, + 'robot0:RFJ0': 0.6699446327514138, + 'robot0:LFJ4': 0.0525442258033891, + 'robot0:LFJ3': -0.13615534724474673, + 'robot0:LFJ2': 0.39872030433433003, + 'robot0:LFJ1': 0.7415570009679252, + 'robot0:LFJ0': 0.704096378652974, + 'robot0:THJ4': 0.003673823825070126, + 'robot0:THJ3': 0.5506291436028695, + 'robot0:THJ2': -0.014515151997119306, + 'robot0:THJ1': -0.0015229223564485414, + 'robot0:THJ0': -0.7894883021600622, +} + + +def goal_distance(goal_a, goal_b): + assert goal_a.shape == goal_b.shape + return np.linalg.norm(goal_a - goal_b, axis=-1) + + +class HandReachEnv(hand_env.HandEnv, utils.EzPickle): + def __init__( + self, distance_threshold=0.01, n_substeps=20, relative_control=False, + initial_qpos=DEFAULT_INITIAL_QPOS, reward_type='sparse', + ): + self.distance_threshold = distance_threshold + self.reward_type = reward_type + + hand_env.HandEnv.__init__( + self, 'hand/reach.xml', n_substeps=n_substeps, initial_qpos=initial_qpos, + relative_control=relative_control) + utils.EzPickle.__init__(self) + + def _get_achieved_goal(self): + goal = [self.sim.data.get_site_xpos(name) for name in FINGERTIP_SITE_NAMES] + return np.array(goal).flatten() + + # GoalEnv methods + # ---------------------------- + + def compute_reward(self, achieved_goal, goal, info): + d = goal_distance(achieved_goal, goal) + if self.reward_type == 'sparse': + return -(d > self.distance_threshold).astype(np.float32) + else: + return -d + + # RobotEnv methods + # ---------------------------- + + def _env_setup(self, initial_qpos): + for name, value in initial_qpos.items(): + self.sim.data.set_joint_qpos(name, value) + self.sim.forward() + + self.initial_goal = self._get_achieved_goal().copy() + self.palm_xpos = self.sim.data.body_xpos[self.sim.model.body_name2id('robot0:palm')].copy() + + def _get_obs(self): + robot_qpos, robot_qvel = robot_get_obs(self.sim) + achieved_goal = self._get_achieved_goal().ravel() + observation = np.concatenate([robot_qpos, robot_qvel, achieved_goal]) + return { + 'observation': observation.copy(), + 'achieved_goal': achieved_goal.copy(), + 'desired_goal': self.goal.copy(), + } + + def _sample_goal(self): + thumb_name = 'robot0:S_thtip' + finger_names = [name for name in FINGERTIP_SITE_NAMES if name != thumb_name] + finger_name = self.np_random.choice(finger_names) + + thumb_idx = FINGERTIP_SITE_NAMES.index(thumb_name) + finger_idx = FINGERTIP_SITE_NAMES.index(finger_name) + assert thumb_idx != finger_idx + + # Pick a meeting point above the hand. 
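# A numeric sketch of the goal layout used by HandReachEnv above: the five
# fingertip positions are flattened into one 15-D vector, and goal_distance()
# takes a single Euclidean norm over that vector, so success requires the joint
# error of all fingertips to stay below distance_threshold (1 cm by default).
# The positions below are made-up placeholders.
import numpy as np

achieved = np.zeros((5, 3)).flatten()         # pretend fingertip positions (metres)
desired = achieved.copy()
desired[2] += 0.008                           # one tip is off by 8 mm along z
d = np.linalg.norm(achieved - desired, axis=-1)
sparse_reward = -(d > 0.01).astype(np.float32)   # 0: still within the 1 cm threshold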
+ meeting_pos = self.palm_xpos + np.array([0.0, -0.09, 0.05]) + meeting_pos += self.np_random.normal(scale=0.005, size=meeting_pos.shape) + + # Slightly move meeting goal towards the respective finger to avoid that they + # overlap. + goal = self.initial_goal.copy().reshape(-1, 3) + for idx in [thumb_idx, finger_idx]: + offset_direction = (meeting_pos - goal[idx]) + offset_direction /= np.linalg.norm(offset_direction) + goal[idx] = meeting_pos - 0.005 * offset_direction + + if self.np_random.uniform() < 0.1: + # With some probability, ask all fingers to move back to the origin. + # This avoids that the thumb constantly stays near the goal position already. + goal = self.initial_goal.copy() + return goal.flatten() + + def _is_success(self, achieved_goal, desired_goal): + d = goal_distance(achieved_goal, desired_goal) + return (d < self.distance_threshold).astype(np.float32) + + def _render_callback(self): + # Visualize targets. + sites_offset = (self.sim.data.site_xpos - self.sim.model.site_pos).copy() + goal = self.goal.reshape(5, 3) + for finger_idx in range(5): + site_name = 'target{}'.format(finger_idx) + site_id = self.sim.model.site_name2id(site_name) + self.sim.model.site_pos[site_id] = goal[finger_idx] - sites_offset[site_id] + + # Visualize finger positions. + achieved_goal = self._get_achieved_goal().reshape(5, 3) + for finger_idx in range(5): + site_name = 'finger{}'.format(finger_idx) + site_id = self.sim.model.site_name2id(site_name) + self.sim.model.site_pos[site_id] = achieved_goal[finger_idx] - sites_offset[site_id] + self.sim.forward() diff --git a/src/gym/envs/robotics/hand_env.py b/src/gym/envs/robotics/hand_env.py new file mode 100644 index 0000000..2b068b0 --- /dev/null +++ b/src/gym/envs/robotics/hand_env.py @@ -0,0 +1,49 @@ +import os +import copy +import numpy as np + +import gym +from gym import error, spaces +from gym.utils import seeding +from gym.envs.robotics import robot_env + + +class HandEnv(robot_env.RobotEnv): + def __init__(self, model_path, n_substeps, initial_qpos, relative_control): + self.relative_control = relative_control + + super(HandEnv, self).__init__( + model_path=model_path, n_substeps=n_substeps, n_actions=20, + initial_qpos=initial_qpos) + + # RobotEnv methods + # ---------------------------- + + def _set_action(self, action): + assert action.shape == (20,) + + ctrlrange = self.sim.model.actuator_ctrlrange + actuation_range = (ctrlrange[:, 1] - ctrlrange[:, 0]) / 2. + if self.relative_control: + actuation_center = np.zeros_like(action) + for i in range(self.sim.data.ctrl.shape[0]): + actuation_center[i] = self.sim.data.get_joint_qpos( + self.sim.model.actuator_names[i].replace(':A_', ':')) + for joint_name in ['FF', 'MF', 'RF', 'LF']: + act_idx = self.sim.model.actuator_name2id( + 'robot0:A_{}J1'.format(joint_name)) + actuation_center[act_idx] += self.sim.data.get_joint_qpos( + 'robot0:{}J0'.format(joint_name)) + else: + actuation_center = (ctrlrange[:, 1] + ctrlrange[:, 0]) / 2. + self.sim.data.ctrl[:] = actuation_center + action * actuation_range + self.sim.data.ctrl[:] = np.clip(self.sim.data.ctrl, ctrlrange[:, 0], ctrlrange[:, 1]) + + def _viewer_setup(self): + body_id = self.sim.model.body_name2id('robot0:palm') + lookat = self.sim.data.body_xpos[body_id] + for idx, value in enumerate(lookat): + self.viewer.cam.lookat[idx] = value + self.viewer.cam.distance = 0.5 + self.viewer.cam.azimuth = 55. + self.viewer.cam.elevation = -25. 
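# A standalone sketch of the actuation mapping in HandEnv._set_action above: a
# policy action in [-1, 1] is rescaled around an actuation centre so that -1/+1
# reach the actuator control-range limits, then clipped. Absolute mode centres on
# the middle of ctrlrange; relative mode centres on the current joint position.
# The control-range numbers are made-up example limits, not taken from the model.
import numpy as np

ctrlrange = np.array([[0.0, 1.571],
                      [-0.349, 0.349]])
action = np.array([1.0, -0.5])
actuation_range = (ctrlrange[:, 1] - ctrlrange[:, 0]) / 2.
actuation_center = (ctrlrange[:, 1] + ctrlrange[:, 0]) / 2.   # absolute control
ctrl = np.clip(actuation_center + action * actuation_range,
               ctrlrange[:, 0], ctrlrange[:, 1])
# ctrl == [1.571, -0.1745]: a full +1 action saturates at the joint limit.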
diff --git a/src/gym/envs/robotics/robot_env.py b/src/gym/envs/robotics/robot_env.py new file mode 100644 index 0000000..6d07140 --- /dev/null +++ b/src/gym/envs/robotics/robot_env.py @@ -0,0 +1,162 @@ +import os +import copy +import numpy as np + +import gym +from gym import error, spaces +from gym.utils import seeding + +try: + import mujoco_py +except ImportError as e: + raise error.DependencyNotInstalled("{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e)) + + +class RobotEnv(gym.GoalEnv): + def __init__(self, model_path, initial_qpos, n_actions, n_substeps): + if model_path.startswith('/'): + fullpath = model_path + else: + fullpath = os.path.join(os.path.dirname(__file__), 'assets', model_path) + if not os.path.exists(fullpath): + raise IOError('File {} does not exist'.format(fullpath)) + + model = mujoco_py.load_model_from_path(fullpath) + self.sim = mujoco_py.MjSim(model, nsubsteps=n_substeps) + self.viewer = None + + self.metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second': int(np.round(1.0 / self.dt)) + } + + self.seed() + self._env_setup(initial_qpos=initial_qpos) + self.initial_state = copy.deepcopy(self.sim.get_state()) + + self.goal = self._sample_goal() + obs = self._get_obs() + self.action_space = spaces.Box(-1., 1., shape=(n_actions,), dtype='float32') + self.observation_space = spaces.Dict(dict( + desired_goal=spaces.Box(-np.inf, np.inf, shape=obs['achieved_goal'].shape, dtype='float32'), + achieved_goal=spaces.Box(-np.inf, np.inf, shape=obs['achieved_goal'].shape, dtype='float32'), + observation=spaces.Box(-np.inf, np.inf, shape=obs['observation'].shape, dtype='float32'), + )) + + @property + def dt(self): + return self.sim.model.opt.timestep * self.sim.nsubsteps + + # Env methods + # ---------------------------- + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def step(self, action): + action = np.clip(action, self.action_space.low, self.action_space.high) + self._set_action(action) + self.sim.step() + self._step_callback() + obs = self._get_obs() + + done = False + info = { + 'is_success': self._is_success(obs['achieved_goal'], self.goal), + } + reward = self.compute_reward(obs['achieved_goal'], self.goal, info) + return obs, reward, done, info + + def reset(self): + # Attempt to reset the simulator. Since we randomize initial conditions, it + # is possible to get into a state with numerical issues (e.g. due to penetration or + # Gimbel lock) or we may not achieve an initial condition (e.g. an object is within the hand). + # In this case, we just keep randomizing until we eventually achieve a valid initial + # configuration. 
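# A minimal interaction sketch against the RobotEnv/GoalEnv interface defined in
# this file. The environment id is an assumption (ids are registered elsewhere and
# vary between gym versions); any Fetch or Hand subclass behaves the same way and
# returns dict observations with observation / achieved_goal / desired_goal keys.
import gym

env = gym.make('FetchReach-v1')   # assumed id
obs = env.reset()
for _ in range(10):
    obs, reward, done, info = env.step(env.action_space.sample())
    if info['is_success']:
        break
env.close()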
+ did_reset_sim = False + while not did_reset_sim: + did_reset_sim = self._reset_sim() + self.goal = self._sample_goal().copy() + obs = self._get_obs() + return obs + + def close(self): + if self.viewer is not None: + # self.viewer.finish() + self.viewer = None + + def render(self, mode='human'): + self._render_callback() + if mode == 'rgb_array': + self._get_viewer().render() + # window size used for old mujoco-py: + width, height = 500, 500 + data = self._get_viewer().read_pixels(width, height, depth=False) + # original image is upside-down, so flip it + return data[::-1, :, :] + elif mode == 'human': + self._get_viewer().render() + + def _get_viewer(self): + if self.viewer is None: + self.viewer = mujoco_py.MjViewer(self.sim) + self._viewer_setup() + return self.viewer + + # Extension methods + # ---------------------------- + + def _reset_sim(self): + """Resets a simulation and indicates whether or not it was successful. + If a reset was unsuccessful (e.g. if a randomized state caused an error in the + simulation), this method should indicate such a failure by returning False. + In such a case, this method will be called again to attempt a the reset again. + """ + self.sim.set_state(self.initial_state) + self.sim.forward() + return True + + def _get_obs(self): + """Returns the observation. + """ + raise NotImplementedError() + + def _set_action(self, action): + """Applies the given action to the simulation. + """ + raise NotImplementedError() + + def _is_success(self, achieved_goal, desired_goal): + """Indicates whether or not the achieved goal successfully achieved the desired goal. + """ + raise NotImplementedError() + + def _sample_goal(self): + """Samples a new goal and returns it. + """ + raise NotImplementedError() + + def _env_setup(self, initial_qpos): + """Initial configuration of the environment. Can be used to configure initial state + and extract information from the simulation. + """ + pass + + def _viewer_setup(self): + """Initial configuration of the viewer. Can be used to set the camera position, + for example. + """ + pass + + def _render_callback(self): + """A custom callback that is called before rendering. Can be used + to implement custom visualizations. + """ + pass + + def _step_callback(self): + """A custom callback that is called after stepping the simulation. Can be used + to enforce additional constraints on the simulation state. + """ + pass diff --git a/src/gym/envs/robotics/rotations.py b/src/gym/envs/robotics/rotations.py new file mode 100644 index 0000000..4aafb64 --- /dev/null +++ b/src/gym/envs/robotics/rotations.py @@ -0,0 +1,369 @@ +# Copyright (c) 2009-2017, Matthew Brett and Christoph Gohlke +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Many methods borrow heavily or entirely from transforms3d:
+# https://github.com/matthew-brett/transforms3d
+# They have mostly been modified to support batched operations.
+
+import numpy as np
+import itertools
+
+'''
+Rotations
+=========
+
+Note: these have caused many subtle bugs in the past.
+Be careful while updating these methods and while using them in clever ways.
+
+See MuJoCo documentation here: http://mujoco.org/book/modeling.html#COrientation
+
+Conventions
+-----------
+ - All functions accept batches as well as individual rotations
+ - All rotation conventions match respective MuJoCo defaults
+ - All angles are in radians
+ - Matrices follow LR convention
+ - Euler Angles are all relative with 'xyz' axes ordering
+ - See specific representation for more information
+
+Representations
+---------------
+
+Euler
+ There are many euler angle frames -- here we will strive to use the default
+ in MuJoCo, which is eulerseq='xyz'.
+ This frame is a relative rotating frame, about x, y, and z axes in order.
+ Relative rotating means that after we rotate about x, then we use the
+ new (rotated) y, and the same for z.
+
+Quaternions
+ These are defined in terms of rotation (angle) about a unit vector (x, y, z)
+ We use the following convention:
+ q0 = cos(angle / 2)
+ q1 = sin(angle / 2) * x
+ q2 = sin(angle / 2) * y
+ q3 = sin(angle / 2) * z
+ This is also sometimes called qw, qx, qy, qz.
+ Note that quaternions are ambiguous, because we can represent a rotation by
+ angle about vector <x, y, z> and -angle about vector <-x, -y, -z>.
+ To choose between these, we pick "first nonzero positive", where we
+ make the first nonzero element of the quaternion positive.
+ This can result in mismatches if you're converting a quaternion that is not
+ "first nonzero positive" to a different representation and back.
+
+Axis Angle
+ (Not currently implemented)
+ These are very straightforward. Rotation is angle about a unit vector.
+
+XY Axes
+ (Not currently implemented)
+ We are given x axis and y axis, and z axis is cross product of x and y.
+
+Z Axis
+ This is NOT RECOMMENDED. Defines a unit vector for the Z axis,
+ but rotation about this axis is not well defined.
+ Instead pick a fixed reference direction for another axis (e.g. X)
+ and calculate the other (e.g. Y = Z cross-product X),
+ then use XY Axes rotation instead.
+
+SO3
+ (Not currently implemented)
+ While not supported by MuJoCo, this representation has a lot of nice features.
+ We expect to add support for these in the future.
+
+TODO / Missing
+--------------
+ - Rotation integration or derivatives (e.g. velocity conversions)
+ - More representations (SO3, etc)
+ - Random sampling (e.g.
sample uniform random rotation) + - Performance benchmarks/measurements + - (Maybe) define everything as to/from matricies, for simplicity +''' + +# For testing whether a number is close to zero +_FLOAT_EPS = np.finfo(np.float64).eps +_EPS4 = _FLOAT_EPS * 4.0 + + +def euler2mat(euler): + """ Convert Euler Angles to Rotation Matrix. See rotation.py for notes """ + euler = np.asarray(euler, dtype=np.float64) + assert euler.shape[-1] == 3, "Invalid shaped euler {}".format(euler) + + ai, aj, ak = -euler[..., 2], -euler[..., 1], -euler[..., 0] + si, sj, sk = np.sin(ai), np.sin(aj), np.sin(ak) + ci, cj, ck = np.cos(ai), np.cos(aj), np.cos(ak) + cc, cs = ci * ck, ci * sk + sc, ss = si * ck, si * sk + + mat = np.empty(euler.shape[:-1] + (3, 3), dtype=np.float64) + mat[..., 2, 2] = cj * ck + mat[..., 2, 1] = sj * sc - cs + mat[..., 2, 0] = sj * cc + ss + mat[..., 1, 2] = cj * sk + mat[..., 1, 1] = sj * ss + cc + mat[..., 1, 0] = sj * cs - sc + mat[..., 0, 2] = -sj + mat[..., 0, 1] = cj * si + mat[..., 0, 0] = cj * ci + return mat + + +def euler2quat(euler): + """ Convert Euler Angles to Quaternions. See rotation.py for notes """ + euler = np.asarray(euler, dtype=np.float64) + assert euler.shape[-1] == 3, "Invalid shape euler {}".format(euler) + + ai, aj, ak = euler[..., 2] / 2, -euler[..., 1] / 2, euler[..., 0] / 2 + si, sj, sk = np.sin(ai), np.sin(aj), np.sin(ak) + ci, cj, ck = np.cos(ai), np.cos(aj), np.cos(ak) + cc, cs = ci * ck, ci * sk + sc, ss = si * ck, si * sk + + quat = np.empty(euler.shape[:-1] + (4,), dtype=np.float64) + quat[..., 0] = cj * cc + sj * ss + quat[..., 3] = cj * sc - sj * cs + quat[..., 2] = -(cj * ss + sj * cc) + quat[..., 1] = cj * cs - sj * sc + return quat + + +def mat2euler(mat): + """ Convert Rotation Matrix to Euler Angles. See rotation.py for notes """ + mat = np.asarray(mat, dtype=np.float64) + assert mat.shape[-2:] == (3, 3), "Invalid shape matrix {}".format(mat) + + cy = np.sqrt(mat[..., 2, 2] * mat[..., 2, 2] + mat[..., 1, 2] * mat[..., 1, 2]) + condition = cy > _EPS4 + euler = np.empty(mat.shape[:-1], dtype=np.float64) + euler[..., 2] = np.where(condition, + -np.arctan2(mat[..., 0, 1], mat[..., 0, 0]), + -np.arctan2(-mat[..., 1, 0], mat[..., 1, 1])) + euler[..., 1] = np.where(condition, + -np.arctan2(-mat[..., 0, 2], cy), + -np.arctan2(-mat[..., 0, 2], cy)) + euler[..., 0] = np.where(condition, + -np.arctan2(mat[..., 1, 2], mat[..., 2, 2]), + 0.0) + return euler + + +def mat2quat(mat): + """ Convert Rotation Matrix to Quaternion. 
See rotation.py for notes """ + mat = np.asarray(mat, dtype=np.float64) + assert mat.shape[-2:] == (3, 3), "Invalid shape matrix {}".format(mat) + + Qxx, Qyx, Qzx = mat[..., 0, 0], mat[..., 0, 1], mat[..., 0, 2] + Qxy, Qyy, Qzy = mat[..., 1, 0], mat[..., 1, 1], mat[..., 1, 2] + Qxz, Qyz, Qzz = mat[..., 2, 0], mat[..., 2, 1], mat[..., 2, 2] + # Fill only lower half of symmetric matrix + K = np.zeros(mat.shape[:-2] + (4, 4), dtype=np.float64) + K[..., 0, 0] = Qxx - Qyy - Qzz + K[..., 1, 0] = Qyx + Qxy + K[..., 1, 1] = Qyy - Qxx - Qzz + K[..., 2, 0] = Qzx + Qxz + K[..., 2, 1] = Qzy + Qyz + K[..., 2, 2] = Qzz - Qxx - Qyy + K[..., 3, 0] = Qyz - Qzy + K[..., 3, 1] = Qzx - Qxz + K[..., 3, 2] = Qxy - Qyx + K[..., 3, 3] = Qxx + Qyy + Qzz + K /= 3.0 + # TODO: vectorize this -- probably could be made faster + q = np.empty(K.shape[:-2] + (4,)) + it = np.nditer(q[..., 0], flags=['multi_index']) + while not it.finished: + # Use Hermitian eigenvectors, values for speed + vals, vecs = np.linalg.eigh(K[it.multi_index]) + # Select largest eigenvector, reorder to w,x,y,z quaternion + q[it.multi_index] = vecs[[3, 0, 1, 2], np.argmax(vals)] + # Prefer quaternion with positive w + # (q * -1 corresponds to same rotation as q) + if q[it.multi_index][0] < 0: + q[it.multi_index] *= -1 + it.iternext() + return q + + +def quat2euler(quat): + """ Convert Quaternion to Euler Angles. See rotation.py for notes """ + return mat2euler(quat2mat(quat)) + + +def subtract_euler(e1, e2): + assert e1.shape == e2.shape + assert e1.shape[-1] == 3 + q1 = euler2quat(e1) + q2 = euler2quat(e2) + q_diff = quat_mul(q1, quat_conjugate(q2)) + return quat2euler(q_diff) + + +def quat2mat(quat): + """ Convert Quaternion to Euler Angles. See rotation.py for notes """ + quat = np.asarray(quat, dtype=np.float64) + assert quat.shape[-1] == 4, "Invalid shape quat {}".format(quat) + + w, x, y, z = quat[..., 0], quat[..., 1], quat[..., 2], quat[..., 3] + Nq = np.sum(quat * quat, axis=-1) + s = 2.0 / Nq + X, Y, Z = x * s, y * s, z * s + wX, wY, wZ = w * X, w * Y, w * Z + xX, xY, xZ = x * X, x * Y, x * Z + yY, yZ, zZ = y * Y, y * Z, z * Z + + mat = np.empty(quat.shape[:-1] + (3, 3), dtype=np.float64) + mat[..., 0, 0] = 1.0 - (yY + zZ) + mat[..., 0, 1] = xY - wZ + mat[..., 0, 2] = xZ + wY + mat[..., 1, 0] = xY + wZ + mat[..., 1, 1] = 1.0 - (xX + zZ) + mat[..., 1, 2] = yZ - wX + mat[..., 2, 0] = xZ - wY + mat[..., 2, 1] = yZ + wX + mat[..., 2, 2] = 1.0 - (xX + yY) + return np.where((Nq > _FLOAT_EPS)[..., np.newaxis, np.newaxis], mat, np.eye(3)) + +def quat_conjugate(q): + inv_q = -q + inv_q[..., 0] *= -1 + return inv_q + +def quat_mul(q0, q1): + assert q0.shape == q1.shape + assert q0.shape[-1] == 4 + assert q1.shape[-1] == 4 + + w0 = q0[..., 0] + x0 = q0[..., 1] + y0 = q0[..., 2] + z0 = q0[..., 3] + + w1 = q1[..., 0] + x1 = q1[..., 1] + y1 = q1[..., 2] + z1 = q1[..., 3] + + w = w0 * w1 - x0 * x1 - y0 * y1 - z0 * z1 + x = w0 * x1 + x0 * w1 + y0 * z1 - z0 * y1 + y = w0 * y1 + y0 * w1 + z0 * x1 - x0 * z1 + z = w0 * z1 + z0 * w1 + x0 * y1 - y0 * x1 + q = np.array([w, x, y, z]) + if q.ndim == 2: + q = q.swapaxes(0, 1) + assert q.shape == q0.shape + return q + +def quat_rot_vec(q, v0): + q_v0 = np.array([0, v0[0], v0[1], v0[2]]) + q_v = quat_mul(q, quat_mul(q_v0, quat_conjugate(q))) + v = q_v[1:] + return v + +def quat_identity(): + return np.array([1, 0, 0, 0]) + +def quat2axisangle(quat): + theta = 0; + axis = np.array([0, 0, 1]); + sin_theta = np.linalg.norm(quat[1:]) + + if (sin_theta > 0.0001): + theta = 2 * np.arcsin(sin_theta) + theta *= 1 if quat[0] 
>= 0 else -1 + axis = quat[1:] / sin_theta + + return axis, theta + +def euler2point_euler(euler): + _euler = euler.copy() + if len(_euler.shape) < 2: + _euler = np.expand_dims(_euler,0) + assert(_euler.shape[1] == 3) + _euler_sin = np.sin(_euler) + _euler_cos = np.cos(_euler) + return np.concatenate([_euler_sin, _euler_cos], axis=-1) + +def point_euler2euler(euler): + _euler = euler.copy() + if len(_euler.shape) < 2: + _euler = np.expand_dims(_euler,0) + assert(_euler.shape[1] == 6) + angle = np.arctan(_euler[..., :3] / _euler[..., 3:]) + angle[_euler[..., 3:] < 0] += np.pi + return angle + +def quat2point_quat(quat): + # Should be in qw, qx, qy, qz + _quat = quat.copy() + if len(_quat.shape) < 2: + _quat = np.expand_dims(_quat, 0) + assert(_quat.shape[1] == 4) + angle = np.arccos(_quat[:,[0]]) * 2 + xyz = _quat[:, 1:] + xyz[np.squeeze(np.abs(np.sin(angle/2))) >= 1e-5] = (xyz / np.sin(angle / 2))[np.squeeze(np.abs(np.sin(angle/2))) >= 1e-5] + return np.concatenate([np.sin(angle),np.cos(angle), xyz], axis=-1) + +def point_quat2quat(quat): + _quat = quat.copy() + if len(_quat.shape) < 2: + _quat = np.expand_dims(_quat, 0) + assert(_quat.shape[1] == 5) + angle = np.arctan(_quat[:,[0]] / _quat[:,[1]]) + qw = np.cos(angle / 2) + + qxyz = _quat[:, 2:] + qxyz[np.squeeze(np.abs(np.sin(angle/2))) >= 1e-5] = (qxyz * np.sin(angle/2))[np.squeeze(np.abs(np.sin(angle/2))) >= 1e-5] + return np.concatenate([qw, qxyz], axis=-1) + +def normalize_angles(angles): + '''Puts angles in [-pi, pi] range.''' + angles = angles.copy() + if angles.size > 0: + angles = (angles + np.pi) % (2 * np.pi) - np.pi + assert -np.pi-1e-6 <= angles.min() and angles.max() <= np.pi+1e-6 + return angles + +def round_to_straight_angles(angles): + '''Returns closest angle modulo 90 degrees ''' + angles = np.round(angles / (np.pi / 2)) * (np.pi / 2) + return normalize_angles(angles) + +def get_parallel_rotations(): + mult90 = [0, np.pi/2, -np.pi/2, np.pi] + parallel_rotations = [] + for euler in itertools.product(mult90, repeat=3): + canonical = mat2euler(euler2mat(euler)) + canonical = np.round(canonical / (np.pi / 2)) + if canonical[0] == -2: + canonical[0] = 2 + if canonical[2] == -2: + canonical[2] = 2 + canonical *= np.pi / 2 + if all([(canonical != rot).any() for rot in parallel_rotations]): + parallel_rotations += [canonical] + assert len(parallel_rotations) == 24 + return parallel_rotations diff --git a/src/gym/envs/robotics/utils.py b/src/gym/envs/robotics/utils.py new file mode 100644 index 0000000..a73e5f6 --- /dev/null +++ b/src/gym/envs/robotics/utils.py @@ -0,0 +1,96 @@ +import numpy as np + +from gym import error +try: + import mujoco_py +except ImportError as e: + raise error.DependencyNotInstalled("{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e)) + + +def robot_get_obs(sim): + """Returns all joint positions and velocities associated with + a robot. + """ + if sim.data.qpos is not None and sim.model.joint_names: + names = [n for n in sim.model.joint_names if n.startswith('robot')] + return ( + np.array([sim.data.get_joint_qpos(name) for name in names]), + np.array([sim.data.get_joint_qvel(name) for name in names]), + ) + return np.zeros(0), np.zeros(0) + + +def ctrl_set_action(sim, action): + """For torque actuators it copies the action into mujoco ctrl field. + For position actuators it sets the target relative to the current qpos. 
+ """ + if sim.model.nmocap > 0: + _, action = np.split(action, (sim.model.nmocap * 7, )) + if sim.data.ctrl is not None: + for i in range(action.shape[0]): + if sim.model.actuator_biastype[i] == 0: + sim.data.ctrl[i] = action[i] + else: + idx = sim.model.jnt_qposadr[sim.model.actuator_trnid[i, 0]] + sim.data.ctrl[i] = sim.data.qpos[idx] + action[i] + + +def mocap_set_action(sim, action): + """The action controls the robot using mocaps. Specifically, bodies + on the robot (for example the gripper wrist) is controlled with + mocap bodies. In this case the action is the desired difference + in position and orientation (quaternion), in world coordinates, + of the of the target body. The mocap is positioned relative to + the target body according to the delta, and the MuJoCo equality + constraint optimizer tries to center the welded body on the mocap. + """ + if sim.model.nmocap > 0: + action, _ = np.split(action, (sim.model.nmocap * 7, )) + action = action.reshape(sim.model.nmocap, 7) + + pos_delta = action[:, :3] + quat_delta = action[:, 3:] + + reset_mocap2body_xpos(sim) + sim.data.mocap_pos[:] = sim.data.mocap_pos + pos_delta + sim.data.mocap_quat[:] = sim.data.mocap_quat + quat_delta + + +def reset_mocap_welds(sim): + """Resets the mocap welds that we use for actuation. + """ + if sim.model.nmocap > 0 and sim.model.eq_data is not None: + for i in range(sim.model.eq_data.shape[0]): + if sim.model.eq_type[i] == mujoco_py.const.EQ_WELD: + sim.model.eq_data[i, :] = np.array( + [0., 0., 0., 1., 0., 0., 0.]) + sim.forward() + + +def reset_mocap2body_xpos(sim): + """Resets the position and orientation of the mocap bodies to the same + values as the bodies they're welded to. + """ + + if (sim.model.eq_type is None or + sim.model.eq_obj1id is None or + sim.model.eq_obj2id is None): + return + for eq_type, obj1_id, obj2_id in zip(sim.model.eq_type, + sim.model.eq_obj1id, + sim.model.eq_obj2id): + if eq_type != mujoco_py.const.EQ_WELD: + continue + + mocap_id = sim.model.body_mocapid[obj1_id] + if mocap_id != -1: + # obj1 is the mocap, obj2 is the welded body + body_idx = obj2_id + else: + # obj2 is the mocap, obj1 is the welded body + mocap_id = sim.model.body_mocapid[obj2_id] + body_idx = obj1_id + + assert (mocap_id != -1) + sim.data.mocap_pos[mocap_id][:] = sim.data.body_xpos[body_idx] + sim.data.mocap_quat[mocap_id][:] = sim.data.body_xquat[body_idx] diff --git a/src/gym/envs/tests/__init__.py b/src/gym/envs/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/gym/envs/tests/spec_list.py b/src/gym/envs/tests/spec_list.py new file mode 100644 index 0000000..f71b7c3 --- /dev/null +++ b/src/gym/envs/tests/spec_list.py @@ -0,0 +1,20 @@ +from gym import envs, logger +import os + +def should_skip_env_spec_for_tests(spec): + # We skip tests for envs that require dependencies or are otherwise + # troublesome to run frequently + ep = spec._entry_point + # Skip mujoco tests for pull request CI + skip_mujoco = not (os.environ.get('MUJOCO_KEY_BUNDLE') or os.path.exists(os.path.expanduser('~/.mujoco/mjkey.txt'))) + if skip_mujoco and (ep.startswith('gym.envs.mujoco:') or ep.startswith('gym.envs.robotics:')): + return True + if ( 'GoEnv' in ep or + 'HexEnv' in ep or + (ep.startswith("gym.envs.atari") and not spec.id.startswith("Pong") and not spec.id.startswith("Seaquest")) + ): + logger.warn("Skipping tests for env {}".format(ep)) + return True + return False + +spec_list = [spec for spec in sorted(envs.registry.all(), key=lambda x: x.id) if spec._entry_point is not None and not 
should_skip_env_spec_for_tests(spec)] diff --git a/src/gym/envs/tests/test_determinism.py b/src/gym/envs/tests/test_determinism.py new file mode 100644 index 0000000..f88eb7c --- /dev/null +++ b/src/gym/envs/tests/test_determinism.py @@ -0,0 +1,74 @@ +import numpy as np +import pytest +from gym import spaces +from gym.envs.tests.spec_list import spec_list + +@pytest.mark.parametrize("spec", spec_list) +def test_env(spec): + + # Note that this precludes running this test in multiple + # threads. However, we probably already can't do multithreading + # due to some environments. + spaces.seed(0) + + env1 = spec.make() + env1.seed(0) + action_samples1 = [env1.action_space.sample() for i in range(4)] + initial_observation1 = env1.reset() + step_responses1 = [env1.step(action) for action in action_samples1] + env1.close() + + spaces.seed(0) + + env2 = spec.make() + env2.seed(0) + action_samples2 = [env2.action_space.sample() for i in range(4)] + initial_observation2 = env2.reset() + step_responses2 = [env2.step(action) for action in action_samples2] + env2.close() + + for i, (action_sample1, action_sample2) in enumerate(zip(action_samples1, action_samples2)): + try: + assert_equals(action_sample1, action_sample2) + except AssertionError: + print('env1.action_space=', env1.action_space) + print('env2.action_space=', env2.action_space) + print('action_samples1=', action_samples1) + print('action_samples2=', action_samples2) + print('[{}] action_sample1: {}, action_sample2: {}'.format(i, action_sample1, action_sample2)) + raise + + # Don't check rollout equality if it's a a nondeterministic + # environment. + if spec.nondeterministic: + return + + assert_equals(initial_observation1, initial_observation2) + + for i, ((o1, r1, d1, i1), (o2, r2, d2, i2)) in enumerate(zip(step_responses1, step_responses2)): + assert_equals(o1, o2, '[{}] '.format(i)) + assert r1 == r2, '[{}] r1: {}, r2: {}'.format(i, r1, r2) + assert d1 == d2, '[{}] d1: {}, d2: {}'.format(i, d1, d2) + + # Go returns a Pachi game board in info, which doesn't + # properly check equality. For now, we hack around this by + # just skipping Go. + if spec.id not in ['Go9x9-v0', 'Go19x19-v0']: + assert_equals(i1, i2, '[{}] '.format(i)) + +def assert_equals(a, b, prefix=None): + assert type(a) == type(b), "{}Differing types: {} and {}".format(prefix, a, b) + if isinstance(a, dict): + assert list(a.keys()) == list(b.keys()), "{}Key sets differ: {} and {}".format(prefix, a, b) + + for k in a.keys(): + v_a = a[k] + v_b = b[k] + assert_equals(v_a, v_b) + elif isinstance(a, np.ndarray): + np.testing.assert_array_equal(a, b) + elif isinstance(a, tuple): + for elem_from_a, elem_from_b in zip(a, b): + assert_equals(elem_from_a, elem_from_b) + else: + assert a == b diff --git a/src/gym/envs/tests/test_envs.py b/src/gym/envs/tests/test_envs.py new file mode 100644 index 0000000..1f19fef --- /dev/null +++ b/src/gym/envs/tests/test_envs.py @@ -0,0 +1,43 @@ +import numpy as np +import pytest +from gym import envs +from gym.envs.tests.spec_list import spec_list + +# This runs a smoketest on each official registered env. We may want +# to try also running environments which are not officially registered +# envs. 
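# A condensed, single-environment version of the smoketest below, using an id that
# appears elsewhere in these tests: reset and one step must both yield observations
# inside the declared observation space, a scalar reward and a boolean done flag.
import numpy as np
import gym

env = gym.make('CartPole-v0')
ob = env.reset()
assert env.observation_space.contains(ob)
ob, reward, done, _info = env.step(env.action_space.sample())
assert env.observation_space.contains(ob)
assert np.isscalar(reward) and isinstance(done, bool)
env.close()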
+@pytest.mark.parametrize("spec", spec_list) +def test_env(spec): + env = spec.make() + ob_space = env.observation_space + act_space = env.action_space + ob = env.reset() + assert ob_space.contains(ob), 'Reset observation: {!r} not in space'.format(ob) + a = act_space.sample() + observation, reward, done, _info = env.step(a) + assert ob_space.contains(observation), 'Step observation: {!r} not in space'.format(observation) + assert np.isscalar(reward), "{} is not a scalar for {}".format(reward, env) + assert isinstance(done, bool), "Expected {} to be a boolean".format(done) + + for mode in env.metadata.get('render.modes', []): + env.render(mode=mode) + + # Make sure we can render the environment after close. + for mode in env.metadata.get('render.modes', []): + env.render(mode=mode) + + env.close() + +# Run a longer rollout on some environments +def test_random_rollout(): + for env in [envs.make('CartPole-v0'), envs.make('FrozenLake-v0')]: + agent = lambda ob: env.action_space.sample() + ob = env.reset() + for _ in range(10): + assert env.observation_space.contains(ob) + a = agent(ob) + assert env.action_space.contains(a) + (ob, _reward, done, _info) = env.step(a) + if done: break + env.close() + diff --git a/src/gym/envs/tests/test_envs_semantics.py b/src/gym/envs/tests/test_envs_semantics.py new file mode 100644 index 0000000..a6e5b79 --- /dev/null +++ b/src/gym/envs/tests/test_envs_semantics.py @@ -0,0 +1,95 @@ +""" +Currently disabled since this was done in a very poor way +Hashed str representation of objects +""" + + +from __future__ import unicode_literals +import json +import hashlib +import os + +import pytest +from gym import spaces, logger +from gym.envs.tests.spec_list import spec_list + +DATA_DIR = os.path.dirname(__file__) +ROLLOUT_STEPS = 100 +episodes = ROLLOUT_STEPS +steps = ROLLOUT_STEPS + +ROLLOUT_FILE = os.path.join(DATA_DIR, 'rollout.json') + +if not os.path.isfile(ROLLOUT_FILE): + with open(ROLLOUT_FILE, "w") as outfile: + json.dump({}, outfile, indent=2) + +def hash_object(unhashed): + return hashlib.sha256(str(unhashed).encode('utf-16')).hexdigest() # This is really bad, str could be same while values change + +def generate_rollout_hash(spec): + spaces.seed(0) + env = spec.make() + env.seed(0) + + observation_list = [] + action_list = [] + reward_list = [] + done_list = [] + + total_steps = 0 + for episode in range(episodes): + if total_steps >= ROLLOUT_STEPS: break + observation = env.reset() + + for step in range(steps): + action = env.action_space.sample() + observation, reward, done, _ = env.step(action) + + action_list.append(action) + observation_list.append(observation) + reward_list.append(reward) + done_list.append(done) + + total_steps += 1 + if total_steps >= ROLLOUT_STEPS: break + + if done: break + + observations_hash = hash_object(observation_list) + actions_hash = hash_object(action_list) + rewards_hash = hash_object(reward_list) + dones_hash = hash_object(done_list) + + env.close() + return observations_hash, actions_hash, rewards_hash, dones_hash + +@pytest.mark.parametrize("spec", spec_list) +def test_env_semantics(spec): + logger.warn("Skipping this test. 
Existing hashes were generated in a bad way") + return + with open(ROLLOUT_FILE) as data_file: + rollout_dict = json.load(data_file) + + if spec.id not in rollout_dict: + if not spec.nondeterministic: + logger.warn("Rollout does not exist for {}, run generate_json.py to generate rollouts for new envs".format(spec.id)) + return + + logger.info("Testing rollout for {} environment...".format(spec.id)) + + observations_now, actions_now, rewards_now, dones_now = generate_rollout_hash(spec) + + errors = [] + if rollout_dict[spec.id]['observations'] != observations_now: + errors.append('Observations not equal for {} -- expected {} but got {}'.format(spec.id, rollout_dict[spec.id]['observations'], observations_now)) + if rollout_dict[spec.id]['actions'] != actions_now: + errors.append('Actions not equal for {} -- expected {} but got {}'.format(spec.id, rollout_dict[spec.id]['actions'], actions_now)) + if rollout_dict[spec.id]['rewards'] != rewards_now: + errors.append('Rewards not equal for {} -- expected {} but got {}'.format(spec.id, rollout_dict[spec.id]['rewards'], rewards_now)) + if rollout_dict[spec.id]['dones'] != dones_now: + errors.append('Dones not equal for {} -- expected {} but got {}'.format(spec.id, rollout_dict[spec.id]['dones'], dones_now)) + if len(errors): + for error in errors: + logger.warn(error) + raise ValueError(errors) diff --git a/src/gym/envs/tests/test_registration.py b/src/gym/envs/tests/test_registration.py new file mode 100644 index 0000000..a7990bb --- /dev/null +++ b/src/gym/envs/tests/test_registration.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +from gym import error, envs +from gym.envs import registration +from gym.envs.classic_control import cartpole + +def test_make(): + env = envs.make('CartPole-v0') + assert env.spec.id == 'CartPole-v0' + assert isinstance(env.unwrapped, cartpole.CartPoleEnv) + +def test_make_deprecated(): + try: + envs.make('Humanoid-v0') + except error.Error: + pass + else: + assert False + +def test_spec(): + spec = envs.spec('CartPole-v0') + assert spec.id == 'CartPole-v0' + +def test_missing_lookup(): + registry = registration.EnvRegistry() + registry.register(id='Test-v0', entry_point=None) + registry.register(id='Test-v15', entry_point=None) + registry.register(id='Test-v9', entry_point=None) + registry.register(id='Other-v100', entry_point=None) + try: + registry.spec('Test-v1') # must match an env name but not the version above + except error.DeprecatedEnv: + pass + else: + assert False + + try: + registry.spec('Unknown-v1') + except error.UnregisteredEnv: + pass + else: + assert False + +def test_malformed_lookup(): + registry = registration.EnvRegistry() + try: + registry.spec(u'“Breakout-v0”') + except error.Error as e: + assert 'malformed environment ID' in '{}'.format(e), 'Unexpected message: {}'.format(e) + else: + assert False diff --git a/src/gym/envs/toy_text/__init__.py b/src/gym/envs/toy_text/__init__.py new file mode 100644 index 0000000..c1d76eb --- /dev/null +++ b/src/gym/envs/toy_text/__init__.py @@ -0,0 +1,9 @@ +from gym.envs.toy_text.blackjack import BlackjackEnv +from gym.envs.toy_text.roulette import RouletteEnv +from gym.envs.toy_text.frozen_lake import FrozenLakeEnv +from gym.envs.toy_text.nchain import NChainEnv +from gym.envs.toy_text.hotter_colder import HotterColder +from gym.envs.toy_text.guessing_game import GuessingGame +from gym.envs.toy_text.kellycoinflip import KellyCoinflipEnv +from gym.envs.toy_text.kellycoinflip import KellyCoinflipGeneralizedEnv +from gym.envs.toy_text.cliffwalking import 
CliffWalkingEnv diff --git a/src/gym/envs/toy_text/blackjack.py b/src/gym/envs/toy_text/blackjack.py new file mode 100644 index 0000000..0cb6193 --- /dev/null +++ b/src/gym/envs/toy_text/blackjack.py @@ -0,0 +1,116 @@ +import gym +from gym import spaces +from gym.utils import seeding + +def cmp(a, b): + return float(a > b) - float(a < b) + +# 1 = Ace, 2-10 = Number cards, Jack/Queen/King = 10 +deck = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10] + + +def draw_card(np_random): + return int(np_random.choice(deck)) + + +def draw_hand(np_random): + return [draw_card(np_random), draw_card(np_random)] + + +def usable_ace(hand): # Does this hand have a usable ace? + return 1 in hand and sum(hand) + 10 <= 21 + + +def sum_hand(hand): # Return current hand total + if usable_ace(hand): + return sum(hand) + 10 + return sum(hand) + + +def is_bust(hand): # Is this hand a bust? + return sum_hand(hand) > 21 + + +def score(hand): # What is the score of this hand (0 if bust) + return 0 if is_bust(hand) else sum_hand(hand) + + +def is_natural(hand): # Is this hand a natural blackjack? + return sorted(hand) == [1, 10] + + +class BlackjackEnv(gym.Env): + """Simple blackjack environment + + Blackjack is a card game where the goal is to obtain cards that sum to as + near as possible to 21 without going over. The player plays against a fixed + dealer. + Face cards (Jack, Queen, King) have point value 10. + Aces can either count as 11 or 1, and it's called 'usable' at 11. + This game is played with an infinite deck (or with replacement). + The game starts with each (player and dealer) having one face up and one + face down card. + + The player can request additional cards (hit=1) until they decide to stop + (stick=0) or exceed 21 (bust). + + After the player sticks, the dealer reveals their facedown card, and draws + until their sum is 17 or greater. If the dealer goes bust the player wins. + + If neither player nor dealer busts, the outcome (win, lose, draw) is + decided by whose sum is closer to 21. The reward for winning is +1, + drawing is 0, and losing is -1. + + The observation is a 3-tuple of: the player's current sum, + the dealer's one showing card (1-10 where 1 is ace), + and whether or not the player holds a usable ace (0 or 1). + + This environment corresponds to the version of the blackjack problem + described in Example 5.1 in Reinforcement Learning: An Introduction + by Sutton and Barto. 
+ http://incompleteideas.net/book/the-book-2nd.html + """ + def __init__(self, natural=False): + self.action_space = spaces.Discrete(2) + self.observation_space = spaces.Tuple(( + spaces.Discrete(32), + spaces.Discrete(11), + spaces.Discrete(2))) + self.seed() + + # Flag to payout 1.5 on a "natural" blackjack win, like casino rules + # Ref: http://www.bicyclecards.com/how-to-play/blackjack/ + self.natural = natural + # Start the first game + self.reset() + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def step(self, action): + assert self.action_space.contains(action) + if action: # hit: add a card to players hand and return + self.player.append(draw_card(self.np_random)) + if is_bust(self.player): + done = True + reward = -1 + else: + done = False + reward = 0 + else: # stick: play out the dealers hand, and score + done = True + while sum_hand(self.dealer) < 17: + self.dealer.append(draw_card(self.np_random)) + reward = cmp(score(self.player), score(self.dealer)) + if self.natural and is_natural(self.player) and reward == 1: + reward = 1.5 + return self._get_obs(), reward, done, {} + + def _get_obs(self): + return (sum_hand(self.player), self.dealer[0], usable_ace(self.player)) + + def reset(self): + self.dealer = draw_hand(self.np_random) + self.player = draw_hand(self.np_random) + return self._get_obs() diff --git a/src/gym/envs/toy_text/cliffwalking.py b/src/gym/envs/toy_text/cliffwalking.py new file mode 100644 index 0000000..2b1aedb --- /dev/null +++ b/src/gym/envs/toy_text/cliffwalking.py @@ -0,0 +1,113 @@ +import numpy as np +import sys +from gym.envs.toy_text import discrete + +UP = 0 +RIGHT = 1 +DOWN = 2 +LEFT = 3 + + +class CliffWalkingEnv(discrete.DiscreteEnv): + """ + This is a simple implementation of the Gridworld Cliff + reinforcement learning task. + + Adapted from Example 6.6 (page 132) from Reinforcement Learning: An Introduction + by Sutton and Barto: + http://incompleteideas.net/book/the-book-2nd.html + + With inspiration from: + https://github.com/dennybritz/reinforcement-learning/blob/master/lib/envs/cliff_walking.py + + The board is a 4x12 matrix, with (using Numpy matrix indexing): + [3, 0] as the start at bottom-left + [3, 11] as the goal at bottom-right + [3, 1..10] as the cliff at bottom-center + + Each time step incurs -1 reward, and stepping into the cliff incurs -100 reward + and a reset to the start. An episode terminates when the agent reaches the goal. 
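    As a worked example of the reward scale: the shortest path, hugging the cliff edge, reaches the goal in 13 steps for a return of -13, while the most cautious path along the top row takes 17 steps for a return of -17.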
+ """ + metadata = {'render.modes': ['human', 'ansi']} + + def __init__(self): + self.shape = (4, 12) + self.start_state_index = np.ravel_multi_index((3, 0), self.shape) + + nS = np.prod(self.shape) + nA = 4 + + # Cliff Location + self._cliff = np.zeros(self.shape, dtype=np.bool) + self._cliff[3, 1:-1] = True + + # Calculate transition probabilities and rewards + P = {} + for s in range(nS): + position = np.unravel_index(s, self.shape) + P[s] = {a: [] for a in range(nA)} + P[s][UP] = self._calculate_transition_prob(position, [-1, 0]) + P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1]) + P[s][DOWN] = self._calculate_transition_prob(position, [1, 0]) + P[s][LEFT] = self._calculate_transition_prob(position, [0, -1]) + + # Calculate initial state distribution + # We always start in state (3, 0) + isd = np.zeros(nS) + isd[self.start_state_index] = 1.0 + + super(CliffWalkingEnv, self).__init__(nS, nA, P, isd) + + def _limit_coordinates(self, coord): + """ + Prevent the agent from falling out of the grid world + :param coord: + :return: + """ + coord[0] = min(coord[0], self.shape[0] - 1) + coord[0] = max(coord[0], 0) + coord[1] = min(coord[1], self.shape[1] - 1) + coord[1] = max(coord[1], 0) + return coord + + def _calculate_transition_prob(self, current, delta): + """ + Determine the outcome for an action. Transition Prob is always 1.0. + :param current: Current position on the grid as (row, col) + :param delta: Change in position for transition + :return: (1.0, new_state, reward, done) + """ + new_position = np.array(current) + np.array(delta) + new_position = self._limit_coordinates(new_position).astype(int) + new_state = np.ravel_multi_index(tuple(new_position), self.shape) + if self._cliff[tuple(new_position)]: + return [(1.0, self.start_state_index, -100, False)] + + terminal_state = (self.shape[0] - 1, self.shape[1] - 1) + is_done = tuple(new_position) == terminal_state + return [(1.0, new_state, -1, is_done)] + + def render(self, mode='human'): + outfile = sys.stdout + + for s in range(self.nS): + position = np.unravel_index(s, self.shape) + if self.s == s: + output = " x " + # Print terminal state + elif position == (3, 11): + output = " T " + elif self._cliff[position]: + output = " C " + else: + output = " o " + + if position[1] == 0: + output = output.lstrip() + if position[1] == self.shape[1] - 1: + output = output.rstrip() + output += '\n' + + outfile.write(output) + outfile.write('\n') + diff --git a/src/gym/envs/toy_text/discrete.py b/src/gym/envs/toy_text/discrete.py new file mode 100644 index 0000000..3a3c82e --- /dev/null +++ b/src/gym/envs/toy_text/discrete.py @@ -0,0 +1,59 @@ +import numpy as np + +from gym import Env, spaces +from gym.utils import seeding + +def categorical_sample(prob_n, np_random): + """ + Sample from categorical distribution + Each row specifies class probabilities + """ + prob_n = np.asarray(prob_n) + csprob_n = np.cumsum(prob_n) + return (csprob_n > np_random.rand()).argmax() + + +class DiscreteEnv(Env): + + """ + Has the following members + - nS: number of states + - nA: number of actions + - P: transitions (*) + - isd: initial state distribution (**) + + (*) dictionary dict of dicts of lists, where + P[s][a] == [(probability, nextstate, reward, done), ...] 
+ (**) list or array of length nS + + + """ + def __init__(self, nS, nA, P, isd): + self.P = P + self.isd = isd + self.lastaction=None # for rendering + self.nS = nS + self.nA = nA + + self.action_space = spaces.Discrete(self.nA) + self.observation_space = spaces.Discrete(self.nS) + + self.seed() + self.reset() + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def reset(self): + self.s = categorical_sample(self.isd, self.np_random) + self.lastaction=None + return self.s + + def step(self, a): + transitions = self.P[self.s][a] + i = categorical_sample([t[0] for t in transitions], self.np_random) + p, s, r, d= transitions[i] + self.s = s + self.lastaction=a + return (s, r, d, {"prob" : p}) diff --git a/src/gym/envs/toy_text/frozen_lake.py b/src/gym/envs/toy_text/frozen_lake.py new file mode 100644 index 0000000..37c803f --- /dev/null +++ b/src/gym/envs/toy_text/frozen_lake.py @@ -0,0 +1,132 @@ +import numpy as np +import sys +from six import StringIO, b + +from gym import utils +from gym.envs.toy_text import discrete + +LEFT = 0 +DOWN = 1 +RIGHT = 2 +UP = 3 + +MAPS = { + "4x4": [ + "SFFF", + "FHFH", + "FFFH", + "HFFG" + ], + "8x8": [ + "SFFFFFFF", + "FFFFFFFF", + "FFFHFFFF", + "FFFFFHFF", + "FFFHFFFF", + "FHHFFFHF", + "FHFFHFHF", + "FFFHFFFG" + ], +} + +class FrozenLakeEnv(discrete.DiscreteEnv): + """ + Winter is here. You and your friends were tossing around a frisbee at the park + when you made a wild throw that left the frisbee out in the middle of the lake. + The water is mostly frozen, but there are a few holes where the ice has melted. + If you step into one of those holes, you'll fall into the freezing water. + At this time, there's an international frisbee shortage, so it's absolutely imperative that + you navigate across the lake and retrieve the disc. + However, the ice is slippery, so you won't always move in the direction you intend. + The surface is described using a grid like the following + + SFFF + FHFH + FFFH + HFFG + + S : starting point, safe + F : frozen surface, safe + H : hole, fall to your doom + G : goal, where the frisbee is located + + The episode ends when you reach the goal or fall in a hole. + You receive a reward of 1 if you reach the goal, and zero otherwise. 
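    Concretely, as encoded in the transition table below: with is_slippery=True (the default), each step moves in the intended direction with probability 1/3 and in each of the two perpendicular directions with probability 1/3; with is_slippery=False, movement is deterministic.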
+ + """ + + metadata = {'render.modes': ['human', 'ansi']} + + def __init__(self, desc=None, map_name="4x4",is_slippery=True): + if desc is None and map_name is None: + raise ValueError('Must provide either desc or map_name') + elif desc is None: + desc = MAPS[map_name] + self.desc = desc = np.asarray(desc,dtype='c') + self.nrow, self.ncol = nrow, ncol = desc.shape + self.reward_range = (0, 1) + + nA = 4 + nS = nrow * ncol + + isd = np.array(desc == b'S').astype('float64').ravel() + isd /= isd.sum() + + P = {s : {a : [] for a in range(nA)} for s in range(nS)} + + def to_s(row, col): + return row*ncol + col + + def inc(row, col, a): + if a==0: # left + col = max(col-1,0) + elif a==1: # down + row = min(row+1,nrow-1) + elif a==2: # right + col = min(col+1,ncol-1) + elif a==3: # up + row = max(row-1,0) + return (row, col) + + for row in range(nrow): + for col in range(ncol): + s = to_s(row, col) + for a in range(4): + li = P[s][a] + letter = desc[row, col] + if letter in b'GH': + li.append((1.0, s, 0, True)) + else: + if is_slippery: + for b in [(a-1)%4, a, (a+1)%4]: + newrow, newcol = inc(row, col, b) + newstate = to_s(newrow, newcol) + newletter = desc[newrow, newcol] + done = bytes(newletter) in b'GH' + rew = float(newletter == b'G') + li.append((1.0/3.0, newstate, rew, done)) + else: + newrow, newcol = inc(row, col, a) + newstate = to_s(newrow, newcol) + newletter = desc[newrow, newcol] + done = bytes(newletter) in b'GH' + rew = float(newletter == b'G') + li.append((1.0, newstate, rew, done)) + + super(FrozenLakeEnv, self).__init__(nS, nA, P, isd) + + def render(self, mode='human'): + outfile = StringIO() if mode == 'ansi' else sys.stdout + + row, col = self.s // self.ncol, self.s % self.ncol + desc = self.desc.tolist() + desc = [[c.decode('utf-8') for c in line] for line in desc] + desc[row][col] = utils.colorize(desc[row][col], "red", highlight=True) + if self.lastaction is not None: + outfile.write(" ({})\n".format(["Left","Down","Right","Up"][self.lastaction])) + else: + outfile.write("\n") + outfile.write("\n".join(''.join(line) for line in desc)+"\n") + + if mode != 'human': + return outfile diff --git a/src/gym/envs/toy_text/guessing_game.py b/src/gym/envs/toy_text/guessing_game.py new file mode 100644 index 0000000..9906ded --- /dev/null +++ b/src/gym/envs/toy_text/guessing_game.py @@ -0,0 +1,87 @@ +import gym +from gym import spaces +from gym.utils import seeding +import numpy as np + + +class GuessingGame(gym.Env): + """Number guessing game + + The object of the game is to guess within 1% of the randomly chosen number + within 200 time steps + + After each step the agent is provided with one of four possible observations + which indicate where the guess is in relation to the randomly chosen number + + 0 - No guess yet submitted (only after reset) + 1 - Guess is lower than the target + 2 - Guess is equal to the target + 3 - Guess is higher than the target + + The rewards are: + 0 if the agent's guess is outside of 1% of the target + 1 if the agent's guess is inside 1% of the target + + The episode terminates after the agent guesses within 1% of the target or + 200 steps have been taken + + The agent will need to use a memory of previously submitted actions and observations + in order to efficiently explore the available actions + + The purpose is to have agents optimise their exploration parameters (e.g. how far to + explore from previous actions) based on previous experience. 
Because the goal changes + each episode, a state-value or action-value function isn't able to provide any additional + benefit apart from being able to tell whether to increase or decrease the next guess. + + The perfect agent would likely learn the bounds of the action space (without referring + to them explicitly) and then follow binary tree style exploration towards the goal number. + """ + def __init__(self): + self.range = 1000 # Randomly selected number is within +/- this value + self.bounds = 10000 + + self.action_space = spaces.Box(low=np.array([-self.bounds]), high=np.array([self.bounds])) + self.observation_space = spaces.Discrete(4) + + self.number = 0 + self.guess_count = 0 + self.guess_max = 200 + self.observation = 0 + + self.seed() + self.reset() + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def step(self, action): + assert self.action_space.contains(action) + + if action < self.number: + self.observation = 1 + + elif action == self.number: + self.observation = 2 + + elif action > self.number: + self.observation = 3 + + reward = 0 + done = False + + if (self.number - self.range * 0.01) < action < (self.number + self.range * 0.01): + reward = 1 + done = True + + self.guess_count += 1 + if self.guess_count >= self.guess_max: + done = True + + return self.observation, reward, done, {"number": self.number, "guesses": self.guess_count} + + def reset(self): + self.number = self.np_random.uniform(-self.range, self.range) + self.guess_count = 0 + self.observation = 0 + return self.observation diff --git a/src/gym/envs/toy_text/hotter_colder.py b/src/gym/envs/toy_text/hotter_colder.py new file mode 100644 index 0000000..6f8e826 --- /dev/null +++ b/src/gym/envs/toy_text/hotter_colder.py @@ -0,0 +1,66 @@ +import gym +from gym import spaces +from gym.utils import seeding +import numpy as np + + +class HotterColder(gym.Env): + """Hotter Colder + The goal of hotter colder is to guess as close as possible to a randomly selected number + + After each step the agent receives an observation of: + 0 - No guess yet submitted (only after reset) + 1 - Guess is lower than the target + 2 - Guess is equal to the target + 3 - Guess is higher than the target + + The reward is calculated as: + ((min(action, self.number) + self.bounds) / (max(action, self.number) + self.bounds)) ** 2 + + Ideally an agent will be able to recognise the 'scent' of a higher reward and + increase the rate at which it guesses in that direction until the reward reaches + its maximum + """ + def __init__(self): + self.range = 1000 # +/- value the randomly selected number can be between + self.bounds = 2000 # Action space bounds + + self.action_space = spaces.Box(low=np.array([-self.bounds]), high=np.array([self.bounds])) + self.observation_space = spaces.Discrete(4) + + self.number = 0 + self.guess_count = 0 + self.guess_max = 200 + self.observation = 0 + + self.seed() + self.reset() + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def step(self, action): + assert self.action_space.contains(action) + + if action < self.number: + self.observation = 1 + + elif action == self.number: + self.observation = 2 + + elif action > self.number: + self.observation = 3 + + reward = ((min(action, self.number) + self.bounds) / (max(action, self.number) + self.bounds)) ** 2 + + self.guess_count += 1 + done = self.guess_count >= self.guess_max + + return self.observation, reward[0], done, {"number": self.number, "guesses": self.guess_count} + + def reset(self): + 
self.number = self.np_random.uniform(-self.range, self.range) + self.guess_count = 0 + self.observation = 0 + return self.observation diff --git a/src/gym/envs/toy_text/kellycoinflip.py b/src/gym/envs/toy_text/kellycoinflip.py new file mode 100644 index 0000000..c2a91fa --- /dev/null +++ b/src/gym/envs/toy_text/kellycoinflip.py @@ -0,0 +1,150 @@ +import gym +from gym import spaces +from gym.utils import seeding +from gym.spaces import prng +# for Generalized Kelly coinflip game distributions: +from scipy.stats import genpareto +import numpy as np +import numpy.random + +def flip(edge, np_random): + return np_random.uniform() < edge + +class KellyCoinflipEnv(gym.Env): + """The Kelly coinflip game is a simple gambling introduced by Haghani & Dewey 2016's 'Rational Decision-Making Under Uncertainty: Observed Betting Patterns on a Biased Coin' (https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2856963), to test human decision-making in a setting like that of the stock market: positive expected value but highly stochastic; they found many subjects performed badly, often going broke, even though optimal play would reach the maximum with ~95% probability. In the coinflip game, the player starts with $25.00 to gamble over 300 rounds; each round, they can bet anywhere up to their net worth (in penny increments), and then a coin is flipped; with P=0.6, the player wins twice what they bet, otherwise, they lose it. $250 is the maximum players are allowed to have. At the end of the 300 rounds, they keep whatever they have. The human subjects earned an average of $91; a simple use of the Kelly criterion (https://en.wikipedia.org/wiki/Kelly_criterion), giving a strategy of betting 20% until the cap is hit, would earn $240; a decision tree analysis shows that optimal play earns $246 (https://www.gwern.net/Coin-flip). The game short-circuits when either wealth = $0 (since one can never recover) or wealth = cap (trivial optimal play: one simply bets nothing thereafter). In this implementation, we default to the paper settings of $25, 60% odds, wealth cap of $250, and 300 rounds. To specify the action space in advance, we multiply the wealth cap (in dollars) by 100 (to allow for all penny bets); should one attempt to bet more money than one has, it is rounded down to one's net worth. (Alternately, a mistaken bet could end the episode immediately; it's not clear to me which version would be better.) 
For a harder version which randomizes the 3 key parameters, see the Generalized Kelly coinflip game.""" + metadata = {'render.modes': ['human']} + def __init__(self, initialWealth=25.0, edge=0.6, maxWealth=250.0, maxRounds=300): + + self.action_space = spaces.Discrete(int(maxWealth*100)) # betting in penny increments + self.observation_space = spaces.Tuple(( + spaces.Box(0, maxWealth, [1]), # (w,b) + spaces.Discrete(maxRounds+1))) + self.reward_range = (0, maxWealth) + self.edge = edge + self.wealth = initialWealth + self.initialWealth = initialWealth + self.maxRounds = maxRounds + self.maxWealth = maxWealth + self.seed() + self.reset() + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def step(self, action): + action = action/100.0 # convert from pennies to dollars + if action > self.wealth: # treat attempts to bet more than possess as == betting everything + action = self.wealth + if self.wealth < 0.000001: + done = True + reward = 0.0 + else: + if self.rounds == 0: + done = True + reward = self.wealth + else: + self.rounds = self.rounds - 1 + done = False + reward = 0.0 + coinflip = flip(self.edge, self.np_random) + if coinflip: + self.wealth = min(self.maxWealth, self.wealth + action) + else: + self.wealth = self.wealth - action + return self._get_obs(), reward, done, {} + + def _get_obs(self): + return (np.array([self.wealth]), self.rounds) + + def reset(self): + self.rounds = self.maxRounds + self.wealth = self.initialWealth + return self._get_obs() + + def render(self, mode='human'): + print("Current wealth: ", self.wealth, "; Rounds left: ", self.rounds) + +class KellyCoinflipGeneralizedEnv(gym.Env): + """The Generalized Kelly coinflip game is an extension by ArthurB & Gwern Branwen which expands the Kelly coinflip game MDP into a POMDP, where the 3 key parameters (edge, maximum wealth, and number of rounds) are unknown random variables drawn from 3 distributions: a Beta(7,3) for the coinflip edge 0-1, a N(300,25) the total number of rounds, and a Pareto(5,200) for the wealth cap. These distributions are chosen to be conjugate & easily updatable, to allow for inference (other choices like the geometric for number of rounds wouldn't make observations informative), and to loosely reflect what a human might expect in the original Kelly coinflip game given that the number of rounds wasn't strictly fixed and they weren't told the wealth cap until they neared it. With these particular distributions, the entire history of the game can be summarized into a few sufficient statistics of rounds-elapsed/wins/losses/max-wealth-ever-reached, from which the Bayes-optimal decision can (in theory) be made; to avoid all agents having to tediously track those sufficient statistics manually in the same way, the observation space is augmented from wealth/rounds-left (rounds-left is deleted because it is a hidden variable) to current-wealth/rounds-elapsed/wins/losses/maximum-observed-wealth. 
The simple Kelly coinflip game can easily be solved by calculating decision trees, but the Generalized Kelly coinflip game may be intractable (although the analysis for the edge case alone suggests that the Bayes-optimal value may be very close to what one would calculate using a decision tree for any specific case), and represents a good challenge for RL agents.""" + metadata = {'render.modes': ['human']} + def __init__(self, initialWealth=25.0, edgePriorAlpha=7, edgePriorBeta=3, maxWealthAlpha=5.0, maxWealthM=200.0, maxRoundsMean=300.0, maxRoundsSD=25.0, reseed=True): + # store the hyperparameters for passing back into __init__() during resets so the same hyperparameters govern the next game's parameters, as the user expects: TODO: this is boilerplate, is there any more elegant way to do this? + self.initialWealth=float(initialWealth) + self.edgePriorAlpha=edgePriorAlpha + self.edgePriorBeta=edgePriorBeta + self.maxWealthAlpha=maxWealthAlpha + self.maxWealthM=maxWealthM + self.maxRoundsMean=maxRoundsMean + self.maxRoundsSD=maxRoundsSD + + # draw this game's set of parameters: + edge = prng.np_random.beta(edgePriorAlpha, edgePriorBeta) + maxWealth = round(genpareto.rvs(maxWealthAlpha, maxWealthM, random_state=prng.np_random)) + maxRounds = int(round(prng.np_random.normal(maxRoundsMean, maxRoundsSD))) + + # add an additional global variable which is the sufficient statistic for the Pareto distribution on wealth cap; + # alpha doesn't update, but x_m does, and simply is the highest wealth count we've seen to date: + self.maxEverWealth = float(self.initialWealth) + # for the coinflip edge, it is total wins/losses: + self.wins = 0 + self.losses = 0 + # for the number of rounds, we need to remember how many rounds we've played: + self.roundsElapsed = 0 + + # the rest proceeds as before: + self.action_space = spaces.Discrete(int(maxWealth*100)) + self.observation_space = spaces.Tuple(( + spaces.Box(0, maxWealth, shape=[1]), # current wealth + spaces.Discrete(maxRounds+1), # rounds elapsed + spaces.Discrete(maxRounds+1), # wins + spaces.Discrete(maxRounds+1), # losses + spaces.Box(0, maxWealth, [1]))) # maximum observed wealth + self.reward_range = (0, maxWealth) + self.edge = edge + self.wealth = self.initialWealth + self.maxRounds = maxRounds + self.rounds = self.maxRounds + self.maxWealth = maxWealth + if reseed or not hasattr(self, 'np_random') : self.seed() + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def step(self, action): + action = action/100.0 + if action > self.wealth: + action = self.wealth + if self.wealth < 0.000001: + done = True + reward = 0.0 + else: + if self.rounds == 0: + done = True + reward = self.wealth + else: + self.rounds = self.rounds - 1 + done = False + reward = 0.0 + coinflip = flip(self.edge, self.np_random) + self.roundsElapsed = self.roundsElapsed+1 + if coinflip: + self.wealth = min(self.maxWealth, self.wealth + action) + self.maxEverWealth = max(self.wealth, self.maxEverWealth) + self.wins = self.wins+1 + else: + self.wealth = self.wealth - action + self.losses = self.losses+1 + return self._get_obs(), reward, done, {} + + def _get_obs(self): + return (np.array([float(self.wealth)]), self.roundsElapsed, self.wins, self.losses, np.array([float(self.maxEverWealth)])) + def reset(self): + # re-init everything to draw new parameters etc, but preserve the RNG for reproducibility and pass in the same hyperparameters as originally specified: + self.__init__(initialWealth=self.initialWealth, 
edgePriorAlpha=self.edgePriorAlpha, edgePriorBeta=self.edgePriorBeta, maxWealthAlpha=self.maxWealthAlpha, maxWealthM=self.maxWealthM, maxRoundsMean=self.maxRoundsMean, maxRoundsSD=self.maxRoundsSD, reseed=False) + return self._get_obs() + def render(self, mode='human'): + print("Current wealth: ", self.wealth, "; Rounds left: ", self.rounds, "; True edge: ", self.edge, + "; True max wealth: ", self.maxWealth, "; True stopping time: ", self.maxRounds, "; Rounds left: ", + self.maxRounds - self.roundsElapsed) diff --git a/src/gym/envs/toy_text/nchain.py b/src/gym/envs/toy_text/nchain.py new file mode 100644 index 0000000..fcd077d --- /dev/null +++ b/src/gym/envs/toy_text/nchain.py @@ -0,0 +1,55 @@ +import gym +from gym import spaces +from gym.utils import seeding + +class NChainEnv(gym.Env): + """n-Chain environment + + This game presents moves along a linear chain of states, with two actions: + 0) forward, which moves along the chain but returns no reward + 1) backward, which returns to the beginning and has a small reward + + The end of the chain, however, presents a large reward, and by moving + 'forward' at the end of the chain this large reward can be repeated. + + At each action, there is a small probability that the agent 'slips' and the + opposite transition is instead taken. + + The observed state is the current state in the chain (0 to n-1). + + This environment is described in section 6.1 of: + A Bayesian Framework for Reinforcement Learning by Malcolm Strens (2000) + http://ceit.aut.ac.ir/~shiry/lecture/machine-learning/papers/BRL-2000.pdf + """ + def __init__(self, n=5, slip=0.2, small=2, large=10): + self.n = n + self.slip = slip # probability of 'slipping' an action + self.small = small # payout for 'backwards' action + self.large = large # payout at end of chain for 'forwards' action + self.state = 0 # Start at beginning of the chain + self.action_space = spaces.Discrete(2) + self.observation_space = spaces.Discrete(self.n) + self.seed() + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def step(self, action): + assert self.action_space.contains(action) + if self.np_random.rand() < self.slip: + action = not action # agent slipped, reverse action taken + if action: # 'backwards': go back to the beginning, get small reward + reward = self.small + self.state = 0 + elif self.state < self.n - 1: # 'forwards': go up along the chain + reward = 0 + self.state += 1 + else: # 'forwards': stay at the end of the chain, collect large reward + reward = self.large + done = False + return self.state, reward, done, {} + + def reset(self): + self.state = 0 + return self.state diff --git a/src/gym/envs/toy_text/roulette.py b/src/gym/envs/toy_text/roulette.py new file mode 100644 index 0000000..0006e7e --- /dev/null +++ b/src/gym/envs/toy_text/roulette.py @@ -0,0 +1,44 @@ +import gym +from gym import spaces +from gym.utils import seeding + + +class RouletteEnv(gym.Env): + """Simple roulette environment + + The roulette wheel has 37 spots. If the bet is 0 and a 0 comes up, + you win a reward of 35. If the parity of your bet matches the parity + of the spin, you win 1. Otherwise you receive a reward of -1. 
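    (With the payouts described above, a bet on 0 wins 35 with probability 1/37 and loses 1 with probability 36/37, i.e. (35 - 36)/37 = -1/37 per spin on average.)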
+ + The long run reward for playing 0 should be -1/37 for any state + + The last action (38) stops the rollout for a return of 0 (walking away) + """ + def __init__(self, spots=37): + self.n = spots + 1 + self.action_space = spaces.Discrete(self.n) + self.observation_space = spaces.Discrete(1) + self.seed() + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def step(self, action): + assert self.action_space.contains(action) + if action == self.n - 1: + # observation, reward, done, info + return 0, 0, True, {} + + # N.B. np.random.randint draws from [A, B) while random.randint draws from [A,B] + val = self.np_random.randint(0, self.n - 1) + if val == action == 0: + reward = self.n - 2.0 + elif val != 0 and action != 0 and val % 2 == action % 2: + reward = 1.0 + else: + reward = -1.0 + return 0, reward, False, {} + + def reset(self): + return 0 diff --git a/src/gym/envs/toy_text/taxi.py b/src/gym/envs/toy_text/taxi.py new file mode 100644 index 0000000..270b33f --- /dev/null +++ b/src/gym/envs/toy_text/taxi.py @@ -0,0 +1,151 @@ +import sys +from six import StringIO +from gym import utils +from gym.envs.toy_text import discrete +import numpy as np + +MAP = [ + "+---------+", + "|R: | : :G|", + "| : : : : |", + "| : : : : |", + "| | : | : |", + "|Y| : |B: |", + "+---------+", +] + +class TaxiEnv(discrete.DiscreteEnv): + """ + The Taxi Problem + from "Hierarchical Reinforcement Learning with the MAXQ Value Function Decomposition" + by Tom Dietterich + + Description: + There are four designated locations in the grid world indicated by R(ed), B(lue), G(reen), and Y(ellow). When the episode starts, the taxi starts off at a random square and the passenger is at a random location. The taxi drive to the passenger's location, pick up the passenger, drive to the passenger's destination (another one of the four specified locations), and then drop off the passenger. Once the passenger is dropped off, the episode ends. + + Observations: + There are 500 discrete states since there are 25 taxi positions, 5 possible locations of the passenger (including the case when the passenger is the taxi), and 4 destination locations. + + Actions: + There are 6 discrete deterministic actions: + - 0: move south + - 1: move north + - 2: move east + - 3: move west + - 4: pickup passenger + - 5: dropoff passenger + + Rewards: + There is a reward of -1 for each action and an additional reward of +20 for delievering the passenger. There is a reward of -10 for executing actions "pickup" and "dropoff" illegally. 
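    State encoding: a state index is ((taxirow * 5 + taxicol) * 5 + passloc) * 4 + destidx, which gives the 25 * 5 * 4 = 500 states above (see encode/decode below).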
+ + + Rendering: + - blue: passenger + - magenta: destination + - yellow: empty taxi + - green: full taxi + - other letters: locations + + """ + metadata = {'render.modes': ['human', 'ansi']} + + def __init__(self): + self.desc = np.asarray(MAP,dtype='c') + + self.locs = locs = [(0,0), (0,4), (4,0), (4,3)] + + nS = 500 + nR = 5 + nC = 5 + maxR = nR-1 + maxC = nC-1 + isd = np.zeros(nS) + nA = 6 + P = {s : {a : [] for a in range(nA)} for s in range(nS)} + for row in range(5): + for col in range(5): + for passidx in range(5): + for destidx in range(4): + state = self.encode(row, col, passidx, destidx) + if passidx < 4 and passidx != destidx: + isd[state] += 1 + for a in range(nA): + # defaults + newrow, newcol, newpassidx = row, col, passidx + reward = -1 + done = False + taxiloc = (row, col) + + if a==0: + newrow = min(row+1, maxR) + elif a==1: + newrow = max(row-1, 0) + if a==2 and self.desc[1+row,2*col+2]==b":": + newcol = min(col+1, maxC) + elif a==3 and self.desc[1+row,2*col]==b":": + newcol = max(col-1, 0) + elif a==4: # pickup + if (passidx < 4 and taxiloc == locs[passidx]): + newpassidx = 4 + else: + reward = -10 + elif a==5: # dropoff + if (taxiloc == locs[destidx]) and passidx==4: + done = True + reward = 20 + elif (taxiloc in locs) and passidx==4: + newpassidx = locs.index(taxiloc) + else: + reward = -10 + newstate = self.encode(newrow, newcol, newpassidx, destidx) + P[state][a].append((1.0, newstate, reward, done)) + isd /= isd.sum() + discrete.DiscreteEnv.__init__(self, nS, nA, P, isd) + + def encode(self, taxirow, taxicol, passloc, destidx): + # (5) 5, 5, 4 + i = taxirow + i *= 5 + i += taxicol + i *= 5 + i += passloc + i *= 4 + i += destidx + return i + + def decode(self, i): + out = [] + out.append(i % 4) + i = i // 4 + out.append(i % 5) + i = i // 5 + out.append(i % 5) + i = i // 5 + out.append(i) + assert 0 <= i < 5 + return reversed(out) + + def render(self, mode='human'): + outfile = StringIO() if mode == 'ansi' else sys.stdout + + out = self.desc.copy().tolist() + out = [[c.decode('utf-8') for c in line] for line in out] + taxirow, taxicol, passidx, destidx = self.decode(self.s) + def ul(x): return "_" if x == " " else x + if passidx < 4: + out[1+taxirow][2*taxicol+1] = utils.colorize(out[1+taxirow][2*taxicol+1], 'yellow', highlight=True) + pi, pj = self.locs[passidx] + out[1+pi][2*pj+1] = utils.colorize(out[1+pi][2*pj+1], 'blue', bold=True) + else: # passenger in taxi + out[1+taxirow][2*taxicol+1] = utils.colorize(ul(out[1+taxirow][2*taxicol+1]), 'green', highlight=True) + + di, dj = self.locs[destidx] + out[1+di][2*dj+1] = utils.colorize(out[1+di][2*dj+1], 'magenta') + outfile.write("\n".join(["".join(row) for row in out])+"\n") + if self.lastaction is not None: + outfile.write(" ({})\n".format(["South", "North", "East", "West", "Pickup", "Dropoff"][self.lastaction])) + else: outfile.write("\n") + + # No need to return anything for human + if mode != 'human': + return outfile diff --git a/src/gym/envs/unittest/__init__.py b/src/gym/envs/unittest/__init__.py new file mode 100644 index 0000000..2c6008a --- /dev/null +++ b/src/gym/envs/unittest/__init__.py @@ -0,0 +1,5 @@ +from gym.envs.unittest.cube_crash import CubeCrash +from gym.envs.unittest.cube_crash import CubeCrashSparse +from gym.envs.unittest.cube_crash import CubeCrashScreenBecomesBlack +from gym.envs.unittest.memorize_digits import MemorizeDigits + diff --git a/src/gym/envs/unittest/cube_crash.py b/src/gym/envs/unittest/cube_crash.py new file mode 100644 index 0000000..6780085 --- /dev/null +++ 
b/src/gym/envs/unittest/cube_crash.py @@ -0,0 +1,149 @@ +import numpy as np +import gym +from gym import spaces +from gym.utils import seeding + +# Unit test environment for CNNs and CNN+RNN algorithms. +# Looks like this (RGB observations): +# +# --------------------------- +# | | +# | | +# | | +# | ** | +# | ** | +# | | +# | | +# | | +# | | +# | | +# ======== ============== +# +# Goal is to go through the hole at the bottom. Agent controls square using Left-Nop-Right actions. +# It falls down automatically, episode length is a bit less than FIELD_H +# +# CubeCrash-v0 # shaped reward +# CubeCrashSparse-v0 # reward 0 or 1 at the end +# CubeCrashScreenBecomesBlack-v0 # for RNNs +# +# To see how it works, run: +# +# python examples/agents/keyboard_agent.py CubeCrashScreen-v0 + +FIELD_W = 32 +FIELD_H = 40 +HOLE_WIDTH = 8 + +color_black = np.array((0,0,0)).astype('float32') +color_white = np.array((255,255,255)).astype('float32') +color_green = np.array((0,255,0)).astype('float32') + +class CubeCrash(gym.Env): + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second' : 60, + 'video.res_w' : FIELD_W, + 'video.res_h' : FIELD_H, + } + + use_shaped_reward = True + use_black_screen = False + use_random_colors = False # Makes env too hard + + def __init__(self): + self.seed() + self.viewer = None + + self.observation_space = spaces.Box(0, 255, (FIELD_H,FIELD_W,3), dtype=np.uint8) + self.action_space = spaces.Discrete(3) + + self.reset() + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def random_color(self): + return np.array([ + self.np_random.randint(low=0, high=255), + self.np_random.randint(low=0, high=255), + self.np_random.randint(low=0, high=255), + ]).astype('uint8') + + def reset(self): + self.cube_x = self.np_random.randint(low=3, high=FIELD_W-3) + self.cube_y = self.np_random.randint(low=3, high=FIELD_H//6) + self.hole_x = self.np_random.randint(low=HOLE_WIDTH, high=FIELD_W-HOLE_WIDTH) + self.bg_color = self.random_color() if self.use_random_colors else color_black + self.potential = None + self.step_n = 0 + while 1: + self.wall_color = self.random_color() if self.use_random_colors else color_white + self.cube_color = self.random_color() if self.use_random_colors else color_green + if np.linalg.norm(self.wall_color - self.bg_color) < 50 or np.linalg.norm(self.cube_color - self.bg_color) < 50: continue + break + return self.step(0)[0] + + def step(self, action): + if action==0: pass + elif action==1: self.cube_x -= 1 + elif action==2: self.cube_x += 1 + else: assert 0, "Action %i is out of range" % action + self.cube_y += 1 + self.step_n += 1 + + obs = np.zeros( (FIELD_H,FIELD_W,3), dtype=np.uint8 ) + obs[:,:,:] = self.bg_color + obs[FIELD_H-5:FIELD_H,:,:] = self.wall_color + obs[FIELD_H-5:FIELD_H, self.hole_x-HOLE_WIDTH//2:self.hole_x+HOLE_WIDTH//2+1, :] = self.bg_color + obs[self.cube_y-1:self.cube_y+2, self.cube_x-1:self.cube_x+2, :] = self.cube_color + if self.use_black_screen and self.step_n > 4: + obs[:] = np.zeros((3,), dtype=np.uint8) + + done = False + reward = 0 + dist = np.abs(self.cube_x - self.hole_x) + if self.potential is not None and self.use_shaped_reward: + reward = (self.potential - dist) * 0.01 + self.potential = dist + + if self.cube_x-1 < 0 or self.cube_x+1 >= FIELD_W: + done = True + reward = -1 + elif self.cube_y+1 >= FIELD_H-5: + if dist >= HOLE_WIDTH//2: + done = True + reward = -1 + elif self.cube_y == FIELD_H: + done = True + reward = +1 + self.last_obs = obs + return obs, reward, 
done, {} + + def render(self, mode='human', close=False): + if close: + if self.viewer is not None: + self.viewer.close() + self.viewer = None + return + + if mode == 'rgb_array': + return self.last_obs + + elif mode == 'human': + from gym.envs.classic_control import rendering + if self.viewer is None: + self.viewer = rendering.SimpleImageViewer() + self.viewer.imshow(self.last_obs) + return self.viewer.isopen + + else: + assert 0, "Render mode '%s' is not supported" % mode + +class CubeCrashSparse(CubeCrash): + use_shaped_reward = False + +class CubeCrashScreenBecomesBlack(CubeCrash): + use_shaped_reward = False + use_black_screen = True + diff --git a/src/gym/envs/unittest/memorize_digits.py b/src/gym/envs/unittest/memorize_digits.py new file mode 100644 index 0000000..27df27e --- /dev/null +++ b/src/gym/envs/unittest/memorize_digits.py @@ -0,0 +1,195 @@ +import numpy as np +import gym +from gym import spaces +from gym.utils import seeding + +# Unit test environment for CNNs. +# Looks like this (RGB observations): +# +# --------------------------- +# | | +# | ****** | +# | ****** | +# | ** ** | +# | ** ** | +# | ** | +# | ** | +# | **** | +# | **** | +# | **** | +# | **** | +# | ********** | +# | ********** | +# | | +# --------------------------- +# +# Agent should hit action 2 to gain reward. Catches off-by-one errors in your agent. +# +# To see how it works, run: +# +# python examples/agents/keyboard_agent.py MemorizeDigits-v0 + +FIELD_W = 32 +FIELD_H = 24 + +bogus_mnist = \ +[[ +" **** ", +"* *", +"* *", +"* *", +"* *", +" **** " +], [ +" ** ", +" * * ", +" * ", +" * ", +" * ", +" *** " +], [ +" **** ", +"* *", +" *", +" *** ", +"** ", +"******" +], [ +" **** ", +"* *", +" ** ", +" *", +"* *", +" **** " +], [ +" * * ", +" * * ", +" * * ", +" **** ", +" * ", +" * " +], [ +" **** ", +" * ", +" **** ", +" * ", +" * ", +" **** " +], [ +" *** ", +" * ", +" **** ", +" * * ", +" * * ", +" **** " +], [ +" **** ", +" * ", +" * ", +" * ", +" * ", +" * " +], [ +" **** ", +"* *", +" **** ", +"* *", +"* *", +" **** " +], [ +" **** ", +"* *", +"* *", +" *****", +" *", +" **** " +]] + +color_black = np.array((0,0,0)).astype('float32') +color_white = np.array((255,255,255)).astype('float32') + +class MemorizeDigits(gym.Env): + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second' : 60, + 'video.res_w' : FIELD_W, + 'video.res_h' : FIELD_H, + } + + use_random_colors = False + + def __init__(self): + self.seed() + self.viewer = None + self.observation_space = spaces.Box(0, 255, (FIELD_H,FIELD_W,3), dtype=np.uint8) + self.action_space = spaces.Discrete(10) + self.bogus_mnist = np.zeros( (10,6,6), dtype=np.uint8 ) + for digit in range(10): + for y in range(6): + self.bogus_mnist[digit,y,:] = [ord(char) for char in bogus_mnist[digit][y]] + self.reset() + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def random_color(self): + return np.array([ + self.np_random.randint(low=0, high=255), + self.np_random.randint(low=0, high=255), + self.np_random.randint(low=0, high=255), + ]).astype('uint8') + + def reset(self): + self.digit_x = self.np_random.randint(low=FIELD_W//5, high=FIELD_W//5*4) + self.digit_y = self.np_random.randint(low=FIELD_H//5, high=FIELD_H//5*4) + self.color_bg = self.random_color() if self.use_random_colors else color_black + self.step_n = 0 + while 1: + self.color_digit = self.random_color() if self.use_random_colors else color_white + if np.linalg.norm(self.color_digit - self.color_bg) < 50: continue + break + 
self.digit = -1 + return self.step(0)[0] + + def step(self, action): + reward = -1 + done = False + self.step_n += 1 + if self.digit==-1: + pass + else: + if self.digit==action: + reward = +1 + done = self.step_n > 20 and 0==self.np_random.randint(low=0, high=5) + self.digit = self.np_random.randint(low=0, high=10) + obs = np.zeros( (FIELD_H,FIELD_W,3), dtype=np.uint8 ) + obs[:,:,:] = self.color_bg + digit_img = np.zeros( (6,6,3), dtype=np.uint8 ) + digit_img[:] = self.color_bg + xxx = self.bogus_mnist[self.digit]==42 + digit_img[xxx] = self.color_digit + obs[self.digit_y-3:self.digit_y+3, self.digit_x-3:self.digit_x+3] = digit_img + self.last_obs = obs + return obs, reward, done, {} + + def render(self, mode='human', close=False): + if close: + if self.viewer is not None: + self.viewer.close() + self.viewer = None + return + + if mode == 'rgb_array': + return self.last_obs + + elif mode == 'human': + from gym.envs.classic_control import rendering + if self.viewer is None: + self.viewer = rendering.SimpleImageViewer() + self.viewer.imshow(self.last_obs) + return self.viewer.isopen + + else: + assert 0, "Render mode '%s' is not supported" % mode + diff --git a/src/gym/error.py b/src/gym/error.py new file mode 100644 index 0000000..08bfecb --- /dev/null +++ b/src/gym/error.py @@ -0,0 +1,139 @@ +import sys + +class Error(Exception): + pass + +# Local errors + +class Unregistered(Error): + """Raised when the user requests an item from the registry that does + not actually exist. + """ + pass + +class UnregisteredEnv(Unregistered): + """Raised when the user requests an env from the registry that does + not actually exist. + """ + pass + +class UnregisteredBenchmark(Unregistered): + """Raised when the user requests an env from the registry that does + not actually exist. + """ + pass + +class DeprecatedEnv(Error): + """Raised when the user requests an env from the registry with an + older version number than the latest env with the same name. + """ + pass + +class UnseedableEnv(Error): + """Raised when the user tries to seed an env that does not support + seeding. + """ + pass + +class DependencyNotInstalled(Error): + pass + +class UnsupportedMode(Exception): + """Raised when the user requests a rendering mode not supported by the + environment. + """ + pass + +class ResetNeeded(Exception): + """When the monitor is active, raised when the user tries to step an + environment that's already done. + """ + pass + +class ResetNotAllowed(Exception): + """When the monitor is active, raised when the user tries to step an + environment that's not yet done. 
+ """ + pass + +class InvalidAction(Exception): + """Raised when the user performs an action not contained within the + action space + """ + pass + +# API errors + +class APIError(Error): + def __init__(self, message=None, http_body=None, http_status=None, + json_body=None, headers=None): + super(APIError, self).__init__(message) + + if http_body and hasattr(http_body, 'decode'): + try: + http_body = http_body.decode('utf-8') + except: + http_body = ('') + + self._message = message + self.http_body = http_body + self.http_status = http_status + self.json_body = json_body + self.headers = headers or {} + self.request_id = self.headers.get('request-id', None) + + def __unicode__(self): + if self.request_id is not None: + msg = self._message or "" + return u"Request {0}: {1}".format(self.request_id, msg) + else: + return self._message + + def __str__(self): + try: # Python 2 + return unicode(self).encode('utf-8') + except NameError: # Python 3 + return self.__unicode__() + + +class APIConnectionError(APIError): + pass + + +class InvalidRequestError(APIError): + + def __init__(self, message, param, http_body=None, + http_status=None, json_body=None, headers=None): + super(InvalidRequestError, self).__init__( + message, http_body, http_status, json_body, + headers) + self.param = param + + +class AuthenticationError(APIError): + pass + +class RateLimitError(APIError): + pass + +# Video errors + +class VideoRecorderError(Error): + pass + +class InvalidFrame(Error): + pass + +# Wrapper errors + +class DoubleWrapperError(Error): + pass + + +class WrapAfterConfigureError(Error): + pass + + +class RetriesExceededError(Error): + pass diff --git a/src/gym/logger.py b/src/gym/logger.py new file mode 100644 index 0000000..24c365c --- /dev/null +++ b/src/gym/logger.py @@ -0,0 +1,35 @@ +from gym.utils import colorize + +DEBUG = 10 +INFO = 20 +WARN = 30 +ERROR = 40 +DISABLED = 50 + +MIN_LEVEL = 30 + +def set_level(level): + """ + Set logging threshold on current logger. 
+ """ + global MIN_LEVEL + MIN_LEVEL = level + +def debug(msg, *args): + if MIN_LEVEL <= DEBUG: + print('%s: %s'%('DEBUG', msg % args)) + +def info(msg, *args): + if MIN_LEVEL <= INFO: + print('%s: %s'%('INFO', msg % args)) + +def warn(msg, *args): + if MIN_LEVEL <= WARN: + print(colorize('%s: %s'%('WARN', msg % args), 'yellow')) + +def error(msg, *args): + if MIN_LEVEL <= ERROR: + print(colorize('%s: %s'%('ERROR', msg % args), 'red')) + +# DEPRECATED: +setLevel = set_level diff --git a/src/gym/spaces/__init__.py b/src/gym/spaces/__init__.py new file mode 100644 index 0000000..4eb21f6 --- /dev/null +++ b/src/gym/spaces/__init__.py @@ -0,0 +1,9 @@ +from gym.spaces.box import Box +from gym.spaces.discrete import Discrete +from gym.spaces.multi_discrete import MultiDiscrete +from gym.spaces.multi_binary import MultiBinary +from gym.spaces.prng import seed, np_random +from gym.spaces.tuple_space import Tuple +from gym.spaces.dict_space import Dict + +__all__ = ["Box", "Discrete", "MultiDiscrete", "MultiBinary", "Tuple", "Dict"] diff --git a/src/gym/spaces/__pycache__/__init__.cpython-37.pyc b/src/gym/spaces/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..42835ca Binary files /dev/null and b/src/gym/spaces/__pycache__/__init__.cpython-37.pyc differ diff --git a/src/gym/spaces/__pycache__/box.cpython-37.pyc b/src/gym/spaces/__pycache__/box.cpython-37.pyc new file mode 100644 index 0000000..94ecf17 Binary files /dev/null and b/src/gym/spaces/__pycache__/box.cpython-37.pyc differ diff --git a/src/gym/spaces/__pycache__/dict_space.cpython-37.pyc b/src/gym/spaces/__pycache__/dict_space.cpython-37.pyc new file mode 100644 index 0000000..9218651 Binary files /dev/null and b/src/gym/spaces/__pycache__/dict_space.cpython-37.pyc differ diff --git a/src/gym/spaces/__pycache__/discrete.cpython-37.pyc b/src/gym/spaces/__pycache__/discrete.cpython-37.pyc new file mode 100644 index 0000000..c1e5f4c Binary files /dev/null and b/src/gym/spaces/__pycache__/discrete.cpython-37.pyc differ diff --git a/src/gym/spaces/__pycache__/multi_binary.cpython-37.pyc b/src/gym/spaces/__pycache__/multi_binary.cpython-37.pyc new file mode 100644 index 0000000..2e6c488 Binary files /dev/null and b/src/gym/spaces/__pycache__/multi_binary.cpython-37.pyc differ diff --git a/src/gym/spaces/__pycache__/multi_discrete.cpython-37.pyc b/src/gym/spaces/__pycache__/multi_discrete.cpython-37.pyc new file mode 100644 index 0000000..a5c3bb6 Binary files /dev/null and b/src/gym/spaces/__pycache__/multi_discrete.cpython-37.pyc differ diff --git a/src/gym/spaces/__pycache__/prng.cpython-37.pyc b/src/gym/spaces/__pycache__/prng.cpython-37.pyc new file mode 100644 index 0000000..88af76d Binary files /dev/null and b/src/gym/spaces/__pycache__/prng.cpython-37.pyc differ diff --git a/src/gym/spaces/__pycache__/tuple_space.cpython-37.pyc b/src/gym/spaces/__pycache__/tuple_space.cpython-37.pyc new file mode 100644 index 0000000..abee962 Binary files /dev/null and b/src/gym/spaces/__pycache__/tuple_space.cpython-37.pyc differ diff --git a/src/gym/spaces/box.py b/src/gym/spaces/box.py new file mode 100644 index 0000000..d0d41f2 --- /dev/null +++ b/src/gym/spaces/box.py @@ -0,0 +1,52 @@ +import numpy as np +import gym +from gym import logger + +class Box(gym.Space): + """ + A box in R^n. + I.e., each coordinate is bounded. 
+ + Example usage: + self.action_space = spaces.Box(low=-10, high=10, shape=(1,)) + """ + def __init__(self, low=None, high=None, shape=None, dtype=None): + """ + Two kinds of valid input: + Box(low=-1.0, high=1.0, shape=(3,4)) # low and high are scalars, and shape is provided + Box(low=np.array([-1.0,-2.0]), high=np.array([2.0,4.0])) # low and high are arrays of the same shape + """ + if shape is None: + assert low.shape == high.shape + shape = low.shape + else: + assert np.isscalar(low) and np.isscalar(high) + low = low + np.zeros(shape) + high = high + np.zeros(shape) + if dtype is None: # Autodetect type + if (high == 255).all(): + dtype = np.uint8 + else: + dtype = np.float32 + logger.warn("gym.spaces.Box autodetected dtype as %s. Please provide explicit dtype." % dtype) + self.low = low.astype(dtype) + self.high = high.astype(dtype) + gym.Space.__init__(self, shape, dtype) + + def sample(self): + return gym.spaces.np_random.uniform(low=self.low, high=self.high + (0 if self.dtype.kind == 'f' else 1), size=self.low.shape).astype(self.dtype) + + def contains(self, x): + return x.shape == self.shape and (x >= self.low).all() and (x <= self.high).all() + + def to_jsonable(self, sample_n): + return np.array(sample_n).tolist() + + def from_jsonable(self, sample_n): + return [np.asarray(sample) for sample in sample_n] + + def __repr__(self): + return "Box" + str(self.shape) + + def __eq__(self, other): + return np.allclose(self.low, other.low) and np.allclose(self.high, other.high) diff --git a/src/gym/spaces/dict_space.py b/src/gym/spaces/dict_space.py new file mode 100644 index 0000000..56cb91a --- /dev/null +++ b/src/gym/spaces/dict_space.py @@ -0,0 +1,75 @@ +import gym +from collections import OrderedDict + +class Dict(gym.Space): + """ + A dictionary of simpler spaces. + + Example usage: + self.observation_space = spaces.Dict({"position": spaces.Discrete(2), "velocity": spaces.Discrete(3)}) + + Example usage [nested]: + self.nested_observation_space = spaces.Dict({ + 'sensors': spaces.Dict({ + 'position': spaces.Box(low=-100, high=100, shape=(3,)), + 'velocity': spaces.Box(low=-1, high=1, shape=(3,)), + 'front_cam': spaces.Tuple(( + spaces.Box(low=0, high=1, shape=(10, 10, 3)), + spaces.Box(low=0, high=1, shape=(10, 10, 3)) + )), + 'rear_cam': spaces.Box(low=0, high=1, shape=(10, 10, 3)), + }), + 'ext_controller': spaces.MultiDiscrete([ [0,4], [0,1], [0,1] ]), + 'inner_state':spaces.Dict({ + 'charge': spaces.Discrete(100), + 'system_checks': spaces.MultiBinary(10), + 'job_status': spaces.Dict({ + 'task': spaces.Discrete(5), + 'progress': spaces.Box(low=0, high=100, shape=()), + }) + }) + }) + """ + def __init__(self, spaces): + if isinstance(spaces, dict) and not isinstance(spaces, OrderedDict): + spaces = OrderedDict(sorted(list(spaces.items()))) + if isinstance(spaces, list): + spaces = OrderedDict(spaces) + self.spaces = spaces + gym.Space.__init__(self, None, None) # None for shape and dtype, since it'll require special handling + + def sample(self): + return OrderedDict([(k, space.sample()) for k, space in self.spaces.items()]) + + def contains(self, x): + if not isinstance(x, dict) or len(x) != len(self.spaces): + return False + for k, space in self.spaces.items(): + if k not in x: + return False + if not space.contains(x[k]): + return False + return True + + def __repr__(self): + return "Dict(" + ", ". 
join([k + ":" + str(s) for k, s in self.spaces.items()]) + ")" + + def to_jsonable(self, sample_n): + # serialize as dict-repr of vectors + return {key: space.to_jsonable([sample[key] for sample in sample_n]) \ + for key, space in self.spaces.items()} + + def from_jsonable(self, sample_n): + dict_of_list = {} + for key, space in self.spaces.items(): + dict_of_list[key] = space.from_jsonable(sample_n[key]) + ret = [] + for i, _ in enumerate(dict_of_list[key]): + entry = {} + for key, value in dict_of_list.items(): + entry[key] = value[i] + ret.append(entry) + return ret + + def __eq__(self, other): + return self.spaces == other.spaces diff --git a/src/gym/spaces/discrete.py b/src/gym/spaces/discrete.py new file mode 100644 index 0000000..c737640 --- /dev/null +++ b/src/gym/spaces/discrete.py @@ -0,0 +1,31 @@ +import numpy as np +import gym + +class Discrete(gym.Space): + """ + {0,1,...,n-1} + + Example usage: + self.observation_space = spaces.Discrete(2) + """ + def __init__(self, n): + self.n = n + gym.Space.__init__(self, (), np.int64) + + def sample(self): + return gym.spaces.np_random.randint(self.n) + + def contains(self, x): + if isinstance(x, int): + as_int = x + elif isinstance(x, (np.generic, np.ndarray)) and (x.dtype.kind in np.typecodes['AllInteger'] and x.shape == ()): + as_int = int(x) + else: + return False + return as_int >= 0 and as_int < self.n + + def __repr__(self): + return "Discrete(%d)" % self.n + + def __eq__(self, other): + return self.n == other.n diff --git a/src/gym/spaces/multi_binary.py b/src/gym/spaces/multi_binary.py new file mode 100644 index 0000000..cfa3364 --- /dev/null +++ b/src/gym/spaces/multi_binary.py @@ -0,0 +1,25 @@ +import gym +import numpy as np + +class MultiBinary(gym.Space): + def __init__(self, n): + self.n = n + gym.Space.__init__(self, (self.n,), np.int8) + + def sample(self): + return gym.spaces.np_random.randint(low=0, high=2, size=self.n).astype(self.dtype) + + def contains(self, x): + return ((x==0) | (x==1)).all() + + def to_jsonable(self, sample_n): + return np.array(sample_n).tolist() + + def from_jsonable(self, sample_n): + return [np.asarray(sample) for sample in sample_n] + + def __repr__(self): + return "MultiBinary({})".format(self.n) + + def __eq__(self, other): + return self.n == other.n diff --git a/src/gym/spaces/multi_discrete.py b/src/gym/spaces/multi_discrete.py new file mode 100644 index 0000000..ef92131 --- /dev/null +++ b/src/gym/spaces/multi_discrete.py @@ -0,0 +1,28 @@ +import gym +import numpy as np + +class MultiDiscrete(gym.Space): + def __init__(self, nvec): + """ + nvec: vector of counts of each categorical variable + """ + self.nvec = np.asarray(nvec, dtype=np.int32) + gym.Space.__init__(self, (self.nvec.shape,), np.int8) + + def sample(self): + return (gym.spaces.np_random.random_sample(self.nvec.shape) * self.nvec).astype(self.dtype) + + def contains(self, x): + return (0 <= x).all() and (x < self.nvec).all() and x.dtype.kind in 'ui' + + def to_jsonable(self, sample_n): + return [sample.tolist() for sample in sample_n] + + def from_jsonable(self, sample_n): + return np.array(sample_n) + + def __repr__(self): + return "MultiDiscrete({})".format(self.nvec) + + def __eq__(self, other): + return np.all(self.nvec == other.nvec) diff --git a/src/gym/spaces/prng.py b/src/gym/spaces/prng.py new file mode 100644 index 0000000..ffca680 --- /dev/null +++ b/src/gym/spaces/prng.py @@ -0,0 +1,20 @@ +import numpy + +np_random = numpy.random.RandomState() + +def seed(seed=None): + """Seed the common numpy.random.RandomState 
used in spaces + + CF + https://github.com/openai/gym/commit/58e6aa95e5af2c738557431f812abb81c505a7cf#commitcomment-17669277 + for some details about why we seed the spaces separately from the + envs, but tl;dr is that it's pretty uncommon for them to be used + within an actual algorithm, and the code becomes simpler to just + use this common numpy.random.RandomState. + """ + np_random.seed(seed) + +# This numpy.random.RandomState gets used in all spaces for their +# 'sample' method. It's not really expected that people will be using +# these in their algorithms. +seed(0) diff --git a/src/gym/spaces/tests/__init__.py b/src/gym/spaces/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/gym/spaces/tests/test_spaces.py b/src/gym/spaces/tests/test_spaces.py new file mode 100644 index 0000000..cadc86f --- /dev/null +++ b/src/gym/spaces/tests/test_spaces.py @@ -0,0 +1,66 @@ +import json # note: ujson fails this test due to float equality +from copy import copy + +import numpy as np +import pytest + +from gym.spaces import Tuple, Box, Discrete, MultiDiscrete, MultiBinary, Dict + + +@pytest.mark.parametrize("space", [ + Discrete(3), + Tuple([Discrete(5), Discrete(10)]), + Tuple([Discrete(5), Box(low=np.array([0, 0]),high=np.array([1, 5]))]), + Tuple((Discrete(5), Discrete(2), Discrete(2))), + MultiDiscrete([2, 2, 100]), + Dict({"position": Discrete(5), "velocity": Box(low=np.array([0, 0]), high=np.array([1, 5]))}), + ]) +def test_roundtripping(space): + sample_1 = space.sample() + sample_2 = space.sample() + assert space.contains(sample_1) + assert space.contains(sample_2) + json_rep = space.to_jsonable([sample_1, sample_2]) + + json_roundtripped = json.loads(json.dumps(json_rep)) + + samples_after_roundtrip = space.from_jsonable(json_roundtripped) + sample_1_prime, sample_2_prime = samples_after_roundtrip + + s1 = space.to_jsonable([sample_1]) + s1p = space.to_jsonable([sample_1_prime]) + s2 = space.to_jsonable([sample_2]) + s2p = space.to_jsonable([sample_2_prime]) + assert s1 == s1p, "Expected {} to equal {}".format(s1, s1p) + assert s2 == s2p, "Expected {} to equal {}".format(s2, s2p) + + +@pytest.mark.parametrize("space", [ + Discrete(3), + Box(low=np.array([-10, 0]),high=np.array([10, 10])), + Tuple([Discrete(5), Discrete(10)]), + Tuple([Discrete(5), Box(low=np.array([0, 0]),high=np.array([1, 5]))]), + Tuple((Discrete(5), Discrete(2), Discrete(2))), + MultiDiscrete([2, 2, 100]), + MultiBinary(6), + Dict({"position": Discrete(5), "velocity": Box(low=np.array([0, 0]), high=np.array([1, 5]))}), + ]) +def test_equality(space): + space1 = space + space2 = copy(space) + assert space1 == space2, "Expected {} to equal {}".format(space1, space2) + + +@pytest.mark.parametrize("spaces", [ + (Discrete(3), Discrete(4)), + (MultiDiscrete([2, 2, 100]), MultiDiscrete([2, 2, 8])), + (MultiBinary(8), MultiBinary(7)), + (Box(low=np.array([-10, 0]),high=np.array([10, 10])), + Box(low=np.array([-10, 0]),high=np.array([10, 9]))), + (Tuple([Discrete(5), Discrete(10)]), Tuple([Discrete(1), Discrete(10)])), + (Dict({"position": Discrete(5)}), Dict({"position": Discrete(4)})), + (Dict({"position": Discrete(5)}), Dict({"speed": Discrete(5)})), + ]) +def test_inequality(spaces): + space1, space2 = spaces + assert space1 != space2, "Expected {} != {}".format(space1, space2) diff --git a/src/gym/spaces/tuple_space.py b/src/gym/spaces/tuple_space.py new file mode 100644 index 0000000..473aa65 --- /dev/null +++ b/src/gym/spaces/tuple_space.py @@ -0,0 +1,35 @@ +import gym + +class 
Tuple(gym.Space): + """ + A tuple (i.e., product) of simpler spaces + + Example usage: + self.observation_space = spaces.Tuple((spaces.Discrete(2), spaces.Discrete(3))) + """ + def __init__(self, spaces): + self.spaces = spaces + gym.Space.__init__(self, None, None) + + def sample(self): + return tuple([space.sample() for space in self.spaces]) + + def contains(self, x): + if isinstance(x, list): + x = tuple(x) # Promote list to tuple for contains check + return isinstance(x, tuple) and len(x) == len(self.spaces) and all( + space.contains(part) for (space,part) in zip(self.spaces,x)) + + def __repr__(self): + return "Tuple(" + ", ". join([str(s) for s in self.spaces]) + ")" + + def to_jsonable(self, sample_n): + # serialize as list-repr of tuple of vectors + return [space.to_jsonable([sample[i] for sample in sample_n]) \ + for i, space in enumerate(self.spaces)] + + def from_jsonable(self, sample_n): + return [sample for sample in zip(*[space.from_jsonable(sample_n[i]) for i, space in enumerate(self.spaces)])] + + def __eq__(self, other): + return self.spaces == other.spaces diff --git a/src/gym/tests/test_core.py b/src/gym/tests/test_core.py new file mode 100644 index 0000000..7256818 --- /dev/null +++ b/src/gym/tests/test_core.py @@ -0,0 +1,15 @@ +from gym import core + +class ArgumentEnv(core.Env): + calls = 0 + + def __init__(self, arg): + self.calls += 1 + self.arg = arg + +def test_env_instantiation(): + # This looks like a pretty trivial, but given our usage of + # __new__, it's worth having. + env = ArgumentEnv('arg') + assert env.arg == 'arg' + assert env.calls == 1 diff --git a/src/gym/utils/__init__.py b/src/gym/utils/__init__.py new file mode 100644 index 0000000..6d6aa82 --- /dev/null +++ b/src/gym/utils/__init__.py @@ -0,0 +1,10 @@ +"""A set of common utilities used within the environments. These are +not intended as API functions, and will not remain stable over time. +""" + +# These submodules should not have any import-time dependencies. +# We want this since we use `utils` during our import-time sanity checks +# that verify that our dependencies are actually present. 
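+# The imports below re-export colorize, EzPickle and reraise at the package
+# level, so callers can simply write e.g. `from gym.utils import colorize`.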
+from .colorize import colorize +from .ezpickle import EzPickle +from .reraise import reraise diff --git a/src/gym/utils/__pycache__/__init__.cpython-37.pyc b/src/gym/utils/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..2c109f1 Binary files /dev/null and b/src/gym/utils/__pycache__/__init__.cpython-37.pyc differ diff --git a/src/gym/utils/__pycache__/atomic_write.cpython-37.pyc b/src/gym/utils/__pycache__/atomic_write.cpython-37.pyc new file mode 100644 index 0000000..049e0bc Binary files /dev/null and b/src/gym/utils/__pycache__/atomic_write.cpython-37.pyc differ diff --git a/src/gym/utils/__pycache__/closer.cpython-37.pyc b/src/gym/utils/__pycache__/closer.cpython-37.pyc new file mode 100644 index 0000000..4998a83 Binary files /dev/null and b/src/gym/utils/__pycache__/closer.cpython-37.pyc differ diff --git a/src/gym/utils/__pycache__/colorize.cpython-37.pyc b/src/gym/utils/__pycache__/colorize.cpython-37.pyc new file mode 100644 index 0000000..9b40383 Binary files /dev/null and b/src/gym/utils/__pycache__/colorize.cpython-37.pyc differ diff --git a/src/gym/utils/__pycache__/ezpickle.cpython-37.pyc b/src/gym/utils/__pycache__/ezpickle.cpython-37.pyc new file mode 100644 index 0000000..d3bbffc Binary files /dev/null and b/src/gym/utils/__pycache__/ezpickle.cpython-37.pyc differ diff --git a/src/gym/utils/__pycache__/json_utils.cpython-37.pyc b/src/gym/utils/__pycache__/json_utils.cpython-37.pyc new file mode 100644 index 0000000..5d51ab2 Binary files /dev/null and b/src/gym/utils/__pycache__/json_utils.cpython-37.pyc differ diff --git a/src/gym/utils/__pycache__/reraise.cpython-37.pyc b/src/gym/utils/__pycache__/reraise.cpython-37.pyc new file mode 100644 index 0000000..bd9f430 Binary files /dev/null and b/src/gym/utils/__pycache__/reraise.cpython-37.pyc differ diff --git a/src/gym/utils/__pycache__/reraise_impl_py3.cpython-37.pyc b/src/gym/utils/__pycache__/reraise_impl_py3.cpython-37.pyc new file mode 100644 index 0000000..40e2208 Binary files /dev/null and b/src/gym/utils/__pycache__/reraise_impl_py3.cpython-37.pyc differ diff --git a/src/gym/utils/atomic_write.py b/src/gym/utils/atomic_write.py new file mode 100644 index 0000000..adb07f6 --- /dev/null +++ b/src/gym/utils/atomic_write.py @@ -0,0 +1,55 @@ +# Based on http://stackoverflow.com/questions/2333872/atomic-writing-to-file-with-python + +import os +from contextlib import contextmanager + +# We would ideally atomically replace any existing file with the new +# version. However, on Windows there's no Python-only solution prior +# to Python 3.3. (This library includes a C extension to do so: +# https://pypi.python.org/pypi/pyosreplace/0.1.) +# +# Correspondingly, we make a best effort, but on Python < 3.3 use a +# replace method which could result in the file temporarily +# disappearing. +import sys +if sys.version_info >= (3, 3): + # Python 3.3 and up have a native `replace` method + from os import replace +elif sys.platform.startswith("win"): + def replace(src, dst): + # TODO: on Windows, this will raise if the file is in use, + # which is possible. We'll need to make this more robust over + # time. + try: + os.remove(dst) + except OSError: + pass + os.rename(src, dst) +else: + # POSIX rename() is always atomic + from os import rename as replace + +@contextmanager +def atomic_write(filepath, binary=False, fsync=False): + """ Writeable file object that atomically updates a file (using a temporary file). 
In some cases (namely Python < 3.3 on Windows), this could result in an existing file being temporarily unlinked. + + :param filepath: the file path to be opened + :param binary: whether to open the file in a binary mode instead of textual + :param fsync: whether to force write the file to disk + """ + + tmppath = filepath + '~' + while os.path.isfile(tmppath): + tmppath += '~' + try: + with open(tmppath, 'wb' if binary else 'w') as file: + yield file + if fsync: + file.flush() + os.fsync(file.fileno()) + replace(tmppath, filepath) + finally: + try: + os.remove(tmppath) + except (IOError, OSError): + pass diff --git a/src/gym/utils/closer.py b/src/gym/utils/closer.py new file mode 100644 index 0000000..a8e5a5f --- /dev/null +++ b/src/gym/utils/closer.py @@ -0,0 +1,67 @@ +import atexit +import threading +import weakref + +class Closer(object): + """A registry that ensures your objects get closed, whether manually, + upon garbage collection, or upon exit. To work properly, your + objects need to cooperate and do something like the following: + + ``` + closer = Closer() + class Example(object): + def __init__(self): + self._id = closer.register(self) + + def close(self): + # Probably worth making idempotent too! + ... + closer.unregister(self._id) + + def __del__(self): + self.close() + ``` + + That is, your objects should: + + - register() themselves and save the returned ID + - unregister() themselves upon close() + - include a __del__ method which close()'s the object + """ + + def __init__(self, atexit_register=True): + self.lock = threading.Lock() + self.next_id = -1 + self.closeables = weakref.WeakValueDictionary() + + if atexit_register: + atexit.register(self.close) + + def generate_next_id(self): + with self.lock: + self.next_id += 1 + return self.next_id + + def register(self, closeable): + """Registers an object with a 'close' method. + + Returns: + int: The registration ID of this object. It is the caller's responsibility to save this ID if early closing is desired. + """ + assert hasattr(closeable, 'close'), 'No close method for {}'.format(closeable) + + next_id = self.generate_next_id() + self.closeables[next_id] = closeable + return next_id + + def unregister(self, id): + assert id is not None + if id in self.closeables: + del self.closeables[id] + + def close(self): + # Explicitly fetch all monitors first so that they can't disappear while + # we iterate. cf. http://stackoverflow.com/a/12429620 + closeables = list(self.closeables.values()) + for closeable in closeables: + closeable.close() diff --git a/src/gym/utils/colorize.py b/src/gym/utils/colorize.py new file mode 100644 index 0000000..da70184 --- /dev/null +++ b/src/gym/utils/colorize.py @@ -0,0 +1,35 @@ +"""A set of common utilities used within the environments. These are +not intended as API functions, and will not remain stable over time. +""" + +color2num = dict( + gray=30, + red=31, + green=32, + yellow=33, + blue=34, + magenta=35, + cyan=36, + white=37, + crimson=38 +) + + +def colorize(string, color, bold=False, highlight = False): + """Return string surrounded by appropriate terminal color codes to + print colorized text. Valid colors: gray, red, green, yellow, + blue, magenta, cyan, white, crimson + """ + + # Import six here so that `utils` has no import-time dependencies. + # We want this since we use `utils` during our import-time sanity checks + # that verify that our dependencies (including six) are actually present. 
+ import six + + attr = [] + num = color2num[color] + if highlight: num += 10 + attr.append(six.u(str(num))) + if bold: attr.append(six.u('1')) + attrs = six.u(';').join(attr) + return six.u('\x1b[%sm%s\x1b[0m') % (attrs, string) diff --git a/src/gym/utils/ezpickle.py b/src/gym/utils/ezpickle.py new file mode 100644 index 0000000..3fb00da --- /dev/null +++ b/src/gym/utils/ezpickle.py @@ -0,0 +1,27 @@ +class EzPickle(object): + """Objects that are pickled and unpickled via their constructor + arguments. + + Example usage: + + class Dog(Animal, EzPickle): + def __init__(self, furcolor, tailkind="bushy"): + Animal.__init__() + EzPickle.__init__(furcolor, tailkind) + ... + + When this object is unpickled, a new Dog will be constructed by passing the provided + furcolor and tailkind into the constructor. However, philosophers are still not sure + whether it is still the same dog. + + This is generally needed only for environments which wrap C/C++ code, such as MuJoCo + and Atari. + """ + def __init__(self, *args, **kwargs): + self._ezpickle_args = args + self._ezpickle_kwargs = kwargs + def __getstate__(self): + return {"_ezpickle_args" : self._ezpickle_args, "_ezpickle_kwargs": self._ezpickle_kwargs} + def __setstate__(self, d): + out = type(self)(*d["_ezpickle_args"], **d["_ezpickle_kwargs"]) + self.__dict__.update(out.__dict__) diff --git a/src/gym/utils/json_utils.py b/src/gym/utils/json_utils.py new file mode 100644 index 0000000..4657dfc --- /dev/null +++ b/src/gym/utils/json_utils.py @@ -0,0 +1,22 @@ +import numpy as np + +def json_encode_np(obj): + """ + JSON can't serialize numpy types, so convert to pure python + """ + if isinstance(obj, np.ndarray): + return list(obj) + elif isinstance(obj, np.float32): + return float(obj) + elif isinstance(obj, np.float64): + return float(obj) + elif isinstance(obj, np.int8): + return int(obj) + elif isinstance(obj, np.int16): + return int(obj) + elif isinstance(obj, np.int32): + return int(obj) + elif isinstance(obj, np.int64): + return int(obj) + else: + return obj diff --git a/src/gym/utils/play.py b/src/gym/utils/play.py new file mode 100644 index 0000000..694089e --- /dev/null +++ b/src/gym/utils/play.py @@ -0,0 +1,186 @@ +import gym +import pygame +import sys +import time +import matplotlib +try: + matplotlib.use('GTK3Agg') + import matplotlib.pyplot as plt +except Exception: + pass + + +import pyglet.window as pw + +from collections import deque +from pygame.locals import HWSURFACE, DOUBLEBUF, RESIZABLE, VIDEORESIZE +from threading import Thread + +def display_arr(screen, arr, video_size, transpose): + arr_min, arr_max = arr.min(), arr.max() + arr = 255.0 * (arr - arr_min) / (arr_max - arr_min) + pyg_img = pygame.surfarray.make_surface(arr.swapaxes(0, 1) if transpose else arr) + pyg_img = pygame.transform.scale(pyg_img, video_size) + screen.blit(pyg_img, (0,0)) + +def play(env, transpose=True, fps=30, zoom=None, callback=None, keys_to_action=None): + """Allows one to play the game using keyboard. + + To simply play the game use: + + play(gym.make("Pong-v3")) + + Above code works also if env is wrapped, so it's particularly useful in + verifying that the frame-level preprocessing does not render the game + unplayable. + + If you wish to plot real time statistics as you play, you can use + gym.utils.play.PlayPlot. Here's a sample code for plotting the reward + for last 5 second of gameplay. 
+ + def callback(obs_t, obs_tp1, rew, done, info): + return [rew,] + env_plotter = EnvPlotter(callback, 30 * 5, ["reward"]) + + env = gym.make("Pong-v3") + play(env, callback=env_plotter.callback) + + + Arguments + --------- + env: gym.Env + Environment to use for playing. + transpose: bool + If True the output of observation is transposed. + Defaults to true. + fps: int + Maximum number of steps of the environment to execute every second. + Defaults to 30. + zoom: float + Make screen edge this many times bigger + callback: lambda or None + Callback if a callback is provided it will be executed after + every step. It takes the following input: + obs_t: observation before performing action + obs_tp1: observation after performing action + action: action that was executed + rew: reward that was received + done: whether the environment is done or not + info: debug info + keys_to_action: dict: tuple(int) -> int or None + Mapping from keys pressed to action performed. + For example if pressed 'w' and space at the same time is supposed + to trigger action number 2 then key_to_action dict would look like this: + + { + # ... + sorted(ord('w'), ord(' ')) -> 2 + # ... + } + If None, default key_to_action mapping for that env is used, if provided. + """ + + obs_s = env.observation_space + assert type(obs_s) == gym.spaces.box.Box + assert len(obs_s.shape) == 2 or (len(obs_s.shape) == 3 and obs_s.shape[2] in [1,3]) + + if keys_to_action is None: + if hasattr(env, 'get_keys_to_action'): + keys_to_action = env.get_keys_to_action() + elif hasattr(env.unwrapped, 'get_keys_to_action'): + keys_to_action = env.unwrapped.get_keys_to_action() + else: + assert False, env.spec.id + " does not have explicit key to action mapping, " + \ + "please specify one manually" + relevant_keys = set(sum(map(list, keys_to_action.keys()),[])) + + if transpose: + video_size = env.observation_space.shape[1], env.observation_space.shape[0] + else: + video_size = env.observation_space.shape[0], env.observation_space.shape[1] + + if zoom is not None: + video_size = int(video_size[0] * zoom), int(video_size[1] * zoom) + + pressed_keys = [] + running = True + env_done = True + + screen = pygame.display.set_mode(video_size) + clock = pygame.time.Clock() + + + while running: + if env_done: + env_done = False + obs = env.reset() + else: + action = keys_to_action.get(tuple(sorted(pressed_keys)), 0) + prev_obs = obs + obs, rew, env_done, info = env.step(action) + if callback is not None: + callback(prev_obs, obs, action, rew, env_done, info) + if obs is not None: + if len(obs.shape) == 2: + obs = obs[:, :, None] + if obs.shape[2] == 1: + obs = obs.repeat(3, axis=2) + display_arr(screen, obs, transpose=transpose, video_size=video_size) + + # process pygame events + for event in pygame.event.get(): + # test events, set key states + if event.type == pygame.KEYDOWN: + if event.key in relevant_keys: + pressed_keys.append(event.key) + elif event.key == 27: + running = False + elif event.type == pygame.KEYUP: + if event.key in relevant_keys: + pressed_keys.remove(event.key) + elif event.type == pygame.QUIT: + running = False + elif event.type == VIDEORESIZE: + video_size = event.size + screen = pygame.display.set_mode(video_size) + print(video_size) + + pygame.display.flip() + clock.tick(fps) + pygame.quit() + +class PlayPlot(object): + def __init__(self, callback, horizon_timesteps, plot_names): + self.data_callback = callback + self.horizon_timesteps = horizon_timesteps + self.plot_names = plot_names + + num_plots = len(self.plot_names) + 
self.fig, self.ax = plt.subplots(num_plots) + if num_plots == 1: + self.ax = [self.ax] + for axis, name in zip(self.ax, plot_names): + axis.set_title(name) + self.t = 0 + self.cur_plot = [None for _ in range(num_plots)] + self.data = [deque(maxlen=horizon_timesteps) for _ in range(num_plots)] + + def callback(self, obs_t, obs_tp1, action, rew, done, info): + points = self.data_callback(obs_t, obs_tp1, action, rew, done, info) + for point, data_series in zip(points, self.data): + data_series.append(point) + self.t += 1 + + xmin, xmax = max(0, self.t - self.horizon_timesteps), self.t + + for i, plot in enumerate(self.cur_plot): + if plot is not None: + plot.remove() + self.cur_plot[i] = self.ax[i].scatter(range(xmin, xmax), list(self.data[i])) + self.ax[i].set_xlim(xmin, xmax) + plt.pause(0.000001) + + +if __name__ == '__main__': + env = gym.make("MontezumaRevengeNoFrameskip-v4") + play(env, zoom=4, fps=60) diff --git a/src/gym/utils/reraise.py b/src/gym/utils/reraise.py new file mode 100644 index 0000000..06a9029 --- /dev/null +++ b/src/gym/utils/reraise.py @@ -0,0 +1,41 @@ +import sys + +# We keep the actual reraising in different modules, since the +# reraising code uses syntax mutually exclusive to Python 2/3. +if sys.version_info[0] < 3: + from .reraise_impl_py2 import reraise_impl #pylint: disable=E0401 +else: + from .reraise_impl_py3 import reraise_impl + +def reraise(prefix=None, suffix=None): + old_exc_type, old_exc_value, traceback = sys.exc_info() + if old_exc_value is None: + old_exc_value = old_exc_type() + + e = ReraisedException(old_exc_value, prefix, suffix) + + reraise_impl(e, traceback) + +# http://stackoverflow.com/a/13653312 +def full_class_name(o): + module = o.__class__.__module__ + if module is None or module == str.__class__.__module__: + return o.__class__.__name__ + return module + '.' + o.__class__.__name__ + +class ReraisedException(Exception): + def __init__(self, old_exc, prefix, suffix): + self.old_exc = old_exc + self.prefix = prefix + self.suffix = suffix + + def __str__(self): + klass = self.old_exc.__class__ + + orig = "%s: %s" % (full_class_name(self.old_exc), klass.__str__(self.old_exc)) + prefixpart = suffixpart = '' + if self.prefix is not None: + prefixpart = self.prefix + "\n" + if self.suffix is not None: + suffixpart = "\n\n" + self.suffix + return "%sThe original exception was:\n\n%s%s" % (prefixpart, orig, suffixpart) diff --git a/src/gym/utils/reraise_impl_py2.py b/src/gym/utils/reraise_impl_py2.py new file mode 100644 index 0000000..9c55b0d --- /dev/null +++ b/src/gym/utils/reraise_impl_py2.py @@ -0,0 +1,2 @@ +def reraise_impl(e, traceback): + raise e.__class__, e, traceback diff --git a/src/gym/utils/reraise_impl_py3.py b/src/gym/utils/reraise_impl_py3.py new file mode 100644 index 0000000..1fc8db5 --- /dev/null +++ b/src/gym/utils/reraise_impl_py3.py @@ -0,0 +1,4 @@ +# http://stackoverflow.com/a/33822606 -- `from None` disables Python 3' +# semi-smart exception chaining, which we don't want in this case. 
+def reraise_impl(e, traceback): + raise e.with_traceback(traceback) from None diff --git a/src/gym/utils/seeding.py b/src/gym/utils/seeding.py new file mode 100644 index 0000000..39fe342 --- /dev/null +++ b/src/gym/utils/seeding.py @@ -0,0 +1,91 @@ +import hashlib +import numpy as np +import os +import random as _random +from six import integer_types +import struct +import sys + +from gym import error + +def np_random(seed=None): + if seed is not None and not (isinstance(seed, integer_types) and 0 <= seed): + raise error.Error('Seed must be a non-negative integer or omitted, not {}'.format(seed)) + + seed = create_seed(seed) + + rng = np.random.RandomState() + rng.seed(_int_list_from_bigint(hash_seed(seed))) + return rng, seed + +def hash_seed(seed=None, max_bytes=8): + """Any given evaluation is likely to have many PRNG's active at + once. (Most commonly, because the environment is running in + multiple processes.) There's literature indicating that having + linear correlations between seeds of multiple PRNG's can correlate + the outputs: + + http://blogs.unity3d.com/2015/01/07/a-primer-on-repeatable-random-numbers/ + http://stackoverflow.com/questions/1554958/how-different-do-random-seeds-need-to-be + http://dl.acm.org/citation.cfm?id=1276928 + + Thus, for sanity we hash the seeds before using them. (This scheme + is likely not crypto-strength, but it should be good enough to get + rid of simple correlations.) + + Args: + seed (Optional[int]): None seeds from an operating system specific randomness source. + max_bytes: Maximum number of bytes to use in the hashed seed. + """ + if seed is None: + seed = create_seed(max_bytes=max_bytes) + hash = hashlib.sha512(str(seed).encode('utf8')).digest() + return _bigint_from_bytes(hash[:max_bytes]) + +def create_seed(a=None, max_bytes=8): + """Create a strong random seed. Otherwise, Python 2 would seed using + the system time, which might be non-robust especially in the + presence of concurrency. + + Args: + a (Optional[int, str]): None seeds from an operating system specific randomness source. + max_bytes: Maximum number of bytes to use in the seed. 
+ """ + # Adapted from https://svn.python.org/projects/python/tags/r32/Lib/random.py + if a is None: + a = _bigint_from_bytes(os.urandom(max_bytes)) + elif isinstance(a, str): + a = a.encode('utf8') + a += hashlib.sha512(a).digest() + a = _bigint_from_bytes(a[:max_bytes]) + elif isinstance(a, integer_types): + a = a % 2**(8 * max_bytes) + else: + raise error.Error('Invalid type for seed: {} ({})'.format(type(a), a)) + + return a + +# TODO: don't hardcode sizeof_int here +def _bigint_from_bytes(bytes): + sizeof_int = 4 + padding = sizeof_int - len(bytes) % sizeof_int + bytes += b'\0' * padding + int_count = int(len(bytes) / sizeof_int) + unpacked = struct.unpack("{}I".format(int_count), bytes) + accum = 0 + for i, val in enumerate(unpacked): + accum += 2 ** (sizeof_int * 8 * i) * val + return accum + +def _int_list_from_bigint(bigint): + # Special case 0 + if bigint < 0: + raise error.Error('Seed must be non-negative, not {}'.format(bigint)) + elif bigint == 0: + return [0] + + ints = [] + while bigint > 0: + bigint, mod = divmod(bigint, 2 ** 32) + ints.append(mod) + return ints diff --git a/src/gym/utils/tests/test_atexit.py b/src/gym/utils/tests/test_atexit.py new file mode 100644 index 0000000..bec6fba --- /dev/null +++ b/src/gym/utils/tests/test_atexit.py @@ -0,0 +1,21 @@ +from gym.utils.closer import Closer + +class Closeable(object): + close_called = False + def close(self): + self.close_called = True + +def test_register_unregister(): + registry = Closer(atexit_register=False) + c1 = Closeable() + c2 = Closeable() + + assert not c1.close_called + assert not c2.close_called + registry.register(c1) + id2 = registry.register(c2) + + registry.unregister(id2) + registry.close() + assert c1.close_called + assert not c2.close_called diff --git a/src/gym/utils/tests/test_seeding.py b/src/gym/utils/tests/test_seeding.py new file mode 100644 index 0000000..12fa69b --- /dev/null +++ b/src/gym/utils/tests/test_seeding.py @@ -0,0 +1,16 @@ +from gym import error +from gym.utils import seeding + +def test_invalid_seeds(): + for seed in [-1, 'test']: + try: + seeding.np_random(seed) + except error.Error: + pass + else: + assert False, 'Invalid seed {} passed validation'.format(seed) + +def test_valid_seeds(): + for seed in [0, 1]: + random, seed1 = seeding.np_random(seed) + assert seed == seed1 diff --git a/src/gym/version.py b/src/gym/version.py new file mode 100644 index 0000000..7244f26 --- /dev/null +++ b/src/gym/version.py @@ -0,0 +1 @@ +VERSION = '0.10.8' diff --git a/src/gym/wrappers/README.md b/src/gym/wrappers/README.md new file mode 100644 index 0000000..3f9ce25 --- /dev/null +++ b/src/gym/wrappers/README.md @@ -0,0 +1,26 @@ +# Wrappers + +Wrappers are used to transform an environment in a modular way: + +``` +env = gym.make('Pong-v0') +env = MyWrapper(env) +``` + +Note that we may later restructure any of the files in this directory, +but will keep the wrappers available at the wrappers' top-level +folder. 
So for example, you should access `MyWrapper` as follows: + +``` +# Will be supported in future releases +from gym.wrappers import MyWrapper +``` + +## Quick tips for writing your own wrapper + +- Don't forget to call super(class_name, self).__init__(env) if you override the wrapper's __init__ function +- You can access the inner environment with `self.unwrapped` +- You can access the previous layer using `self.env` +- The variables `metadata`, `action_space`, `observation_space`, `reward_range`, and `spec` are copied to `self` from the previous layer +- Create a wrapped function for at least one of the following: `__init__(self, env)`, `_step`, `_reset`, `_render`, `_close`, or `_seed` +- Your layered function should take its input from the previous layer (`self.env`) and/or the inner layer (`self.unwrapped`) diff --git a/src/gym/wrappers/__init__.py b/src/gym/wrappers/__init__.py new file mode 100644 index 0000000..cf0d21b --- /dev/null +++ b/src/gym/wrappers/__init__.py @@ -0,0 +1,4 @@ +from gym import error +from gym.wrappers.monitor import Monitor +from gym.wrappers.time_limit import TimeLimit +from gym.wrappers.dict import FlattenDictWrapper diff --git a/src/gym/wrappers/__pycache__/__init__.cpython-37.pyc b/src/gym/wrappers/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..415ff62 Binary files /dev/null and b/src/gym/wrappers/__pycache__/__init__.cpython-37.pyc differ diff --git a/src/gym/wrappers/__pycache__/dict.cpython-37.pyc b/src/gym/wrappers/__pycache__/dict.cpython-37.pyc new file mode 100644 index 0000000..30bf723 Binary files /dev/null and b/src/gym/wrappers/__pycache__/dict.cpython-37.pyc differ diff --git a/src/gym/wrappers/__pycache__/monitor.cpython-37.pyc b/src/gym/wrappers/__pycache__/monitor.cpython-37.pyc new file mode 100644 index 0000000..8e4ca61 Binary files /dev/null and b/src/gym/wrappers/__pycache__/monitor.cpython-37.pyc differ diff --git a/src/gym/wrappers/__pycache__/time_limit.cpython-37.pyc b/src/gym/wrappers/__pycache__/time_limit.cpython-37.pyc new file mode 100644 index 0000000..1216cc8 Binary files /dev/null and b/src/gym/wrappers/__pycache__/time_limit.cpython-37.pyc differ diff --git a/src/gym/wrappers/dict.py b/src/gym/wrappers/dict.py new file mode 100644 index 0000000..d583467 --- /dev/null +++ b/src/gym/wrappers/dict.py @@ -0,0 +1,28 @@ +import gym +import numpy as np + + +__all__ = ['FlattenDictWrapper'] + + +class FlattenDictWrapper(gym.ObservationWrapper): + """Flattens selected keys of a Dict observation space into + an array. + """ + def __init__(self, env, dict_keys): + super(FlattenDictWrapper, self).__init__(env) + self.dict_keys = dict_keys + + # Figure out observation_space dimension. 
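+        # The flattened size is the sum of np.prod(shape) over the selected
+        # sub-spaces; observation() below concatenates the ravelled values to match.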
+ size = 0 + for key in dict_keys: + shape = self.env.observation_space.spaces[key].shape + size += np.prod(shape) + self.observation_space = gym.spaces.Box(-np.inf, np.inf, shape=(size,), dtype='float32') + + def observation(self, observation): + assert isinstance(observation, dict) + obs = [] + for key in self.dict_keys: + obs.append(observation[key].ravel()) + return np.concatenate(obs) diff --git a/src/gym/wrappers/monitor.py b/src/gym/wrappers/monitor.py new file mode 100644 index 0000000..87c83ab --- /dev/null +++ b/src/gym/wrappers/monitor.py @@ -0,0 +1,381 @@ +import gym +from gym import Wrapper +from gym import error, version, logger +import os, json, numpy as np, six +from gym.wrappers.monitoring import stats_recorder, video_recorder +from gym.utils import atomic_write, closer +from gym.utils.json_utils import json_encode_np + +FILE_PREFIX = 'openaigym' +MANIFEST_PREFIX = FILE_PREFIX + '.manifest' + +class Monitor(Wrapper): + def __init__(self, env, directory, video_callable=None, force=False, resume=False, + write_upon_reset=False, uid=None, mode=None): + super(Monitor, self).__init__(env) + + self.videos = [] + + self.stats_recorder = None + self.video_recorder = None + self.enabled = False + self.episode_id = 0 + self._monitor_id = None + self.env_semantics_autoreset = env.metadata.get('semantics.autoreset') + + self._start(directory, video_callable, force, resume, + write_upon_reset, uid, mode) + + def step(self, action): + self._before_step(action) + observation, reward, done, info = self.env.step(action) + done = self._after_step(observation, reward, done, info) + + return observation, reward, done, info + + def reset(self, **kwargs): + self._before_reset() + observation = self.env.reset(**kwargs) + self._after_reset(observation) + + return observation + + def close(self): + super(Monitor, self)._close() + + # _monitor will not be set if super(Monitor, self).__init__ raises, this check prevents a confusing error message + if getattr(self, '_monitor', None): + self.close() + + def set_monitor_mode(self, mode): + logger.info("Setting the monitor mode is deprecated and will be removed soon") + self._set_mode(mode) + + + def _start(self, directory, video_callable=None, force=False, resume=False, + write_upon_reset=False, uid=None, mode=None): + """Start monitoring. + + Args: + directory (str): A per-training run directory where to record stats. + video_callable (Optional[function, False]): function that takes in the index of the episode and outputs a boolean, indicating whether we should record a video on this episode. The default (for video_callable is None) is to take perfect cubes, capped at 1000. False disables video recording. + force (bool): Clear out existing training data from this directory (by deleting every file prefixed with "openaigym."). + resume (bool): Retain the training data already in this directory, which will be merged with our new data + write_upon_reset (bool): Write the manifest file on each reset. (This is currently a JSON file, so writing it is somewhat expensive.) + uid (Optional[str]): A unique id used as part of the suffix for the file. By default, uses os.getpid(). + mode (['evaluation', 'training']): Whether this is an evaluation or training episode. + """ + if self.env.spec is None: + logger.warn("Trying to monitor an environment which has no 'spec' set. 
This usually means you did not create it via 'gym.make', and is recommended only for advanced users.") + env_id = '(unknown)' + else: + env_id = self.env.spec.id + + if not os.path.exists(directory): + logger.info('Creating monitor directory %s', directory) + if six.PY3: + os.makedirs(directory, exist_ok=True) + else: + os.makedirs(directory) + + if video_callable is None: + video_callable = capped_cubic_video_schedule + elif video_callable == False: + video_callable = disable_videos + elif not callable(video_callable): + raise error.Error('You must provide a function, None, or False for video_callable, not {}: {}'.format(type(video_callable), video_callable)) + self.video_callable = video_callable + + # Check on whether we need to clear anything + if force: + clear_monitor_files(directory) + elif not resume: + training_manifests = detect_training_manifests(directory) + if len(training_manifests) > 0: + raise error.Error('''Trying to write to monitor directory {} with existing monitor files: {}. + + You should use a unique directory for each training run, or use 'force=True' to automatically clear previous monitor files.'''.format(directory, ', '.join(training_manifests[:5]))) + + self._monitor_id = monitor_closer.register(self) + + self.enabled = True + self.directory = os.path.abspath(directory) + # We use the 'openai-gym' prefix to determine if a file is + # ours + self.file_prefix = FILE_PREFIX + self.file_infix = '{}.{}'.format(self._monitor_id, uid if uid else os.getpid()) + + self.stats_recorder = stats_recorder.StatsRecorder(directory, '{}.episode_batch.{}'.format(self.file_prefix, self.file_infix), autoreset=self.env_semantics_autoreset, env_id=env_id) + + if not os.path.exists(directory): os.mkdir(directory) + self.write_upon_reset = write_upon_reset + + if mode is not None: + self._set_mode(mode) + + def _flush(self, force=False): + """Flush all relevant monitor information to disk.""" + if not self.write_upon_reset and not force: + return + + self.stats_recorder.flush() + + # Give it a very distiguished name, since we need to pick it + # up from the filesystem later. + path = os.path.join(self.directory, '{}.manifest.{}.manifest.json'.format(self.file_prefix, self.file_infix)) + logger.debug('Writing training manifest file to %s', path) + with atomic_write.atomic_write(path) as f: + # We need to write relative paths here since people may + # move the training_dir around. It would be cleaner to + # already have the basenames rather than basename'ing + # manually, but this works for now. + json.dump({ + 'stats': os.path.basename(self.stats_recorder.path), + 'videos': [(os.path.basename(v), os.path.basename(m)) + for v, m in self.videos], + 'env_info': self._env_info(), + }, f, default=json_encode_np) + + def close(self): + """Flush all monitor data to disk and close any open rending windows.""" + if not self.enabled: + return + self.stats_recorder.close() + if self.video_recorder is not None: + self._close_video_recorder() + self._flush(force=True) + + # Stop tracking this for autoclose + monitor_closer.unregister(self._monitor_id) + self.enabled = False + + logger.info('''Finished writing results. 
You can upload them to the scoreboard via gym.upload(%r)''', self.directory) + + def _set_mode(self, mode): + if mode == 'evaluation': + type = 'e' + elif mode == 'training': + type = 't' + else: + raise error.Error('Invalid mode {}: must be "training" or "evaluation"', mode) + self.stats_recorder.type = type + + def _before_step(self, action): + if not self.enabled: return + self.stats_recorder.before_step(action) + + def _after_step(self, observation, reward, done, info): + if not self.enabled: return done + + if done and self.env_semantics_autoreset: + # For envs with BlockingReset wrapping VNCEnv, this observation will be the first one of the new episode + self.reset_video_recorder() + self.episode_id += 1 + self._flush() + + # Record stats + self.stats_recorder.after_step(observation, reward, done, info) + # Record video + self.video_recorder.capture_frame() + + return done + + def _before_reset(self): + if not self.enabled: return + self.stats_recorder.before_reset() + + def _after_reset(self, observation): + if not self.enabled: return + + # Reset the stat count + self.stats_recorder.after_reset(observation) + + self.reset_video_recorder() + + # Bump *after* all reset activity has finished + self.episode_id += 1 + + self._flush() + + def reset_video_recorder(self): + # Close any existing video recorder + if self.video_recorder: + self._close_video_recorder() + + # Start recording the next video. + # + # TODO: calculate a more correct 'episode_id' upon merge + self.video_recorder = video_recorder.VideoRecorder( + env=self.env, + base_path=os.path.join(self.directory, '{}.video.{}.video{:06}'.format(self.file_prefix, self.file_infix, self.episode_id)), + metadata={'episode_id': self.episode_id}, + enabled=self._video_enabled(), + ) + self.video_recorder.capture_frame() + + def _close_video_recorder(self): + self.video_recorder.close() + if self.video_recorder.functional: + self.videos.append((self.video_recorder.path, self.video_recorder.metadata_path)) + + def _video_enabled(self): + return self.video_callable(self.episode_id) + + def _env_info(self): + env_info = { + 'gym_version': version.VERSION, + } + if self.env.spec: + env_info['env_id'] = self.env.spec.id + return env_info + + def __del__(self): + # Make sure we've closed up shop when garbage collecting + self.close() + + def get_total_steps(self): + return self.stats_recorder.total_steps + + def get_episode_rewards(self): + return self.stats_recorder.episode_rewards + + def get_episode_lengths(self): + return self.stats_recorder.episode_lengths + +def detect_training_manifests(training_dir, files=None): + if files is None: + files = os.listdir(training_dir) + return [os.path.join(training_dir, f) for f in files if f.startswith(MANIFEST_PREFIX + '.')] + +def detect_monitor_files(training_dir): + return [os.path.join(training_dir, f) for f in os.listdir(training_dir) if f.startswith(FILE_PREFIX + '.')] + +def clear_monitor_files(training_dir): + files = detect_monitor_files(training_dir) + if len(files) == 0: + return + + logger.info('Clearing %d monitor files from previous run (because force=True was provided)', len(files)) + for file in files: + os.unlink(file) + +def capped_cubic_video_schedule(episode_id): + if episode_id < 1000: + return int(round(episode_id ** (1. / 3))) ** 3 == episode_id + else: + return episode_id % 1000 == 0 + +def disable_videos(episode_id): + return False + +monitor_closer = closer.Closer() + +# This method gets used for a sanity check in scoreboard/api.py. 
It's +# not intended for use outside of the gym codebase. +def _open_monitors(): + return list(monitor_closer.closeables.values()) + +def load_env_info_from_manifests(manifests, training_dir): + env_infos = [] + for manifest in manifests: + with open(manifest) as f: + contents = json.load(f) + env_infos.append(contents['env_info']) + + env_info = collapse_env_infos(env_infos, training_dir) + return env_info + +def load_results(training_dir): + if not os.path.exists(training_dir): + logger.error('Training directory %s not found', training_dir) + return + + manifests = detect_training_manifests(training_dir) + if not manifests: + logger.error('No manifests found in training directory %s', training_dir) + return + + logger.debug('Uploading data from manifest %s', ', '.join(manifests)) + + # Load up stats + video files + stats_files = [] + videos = [] + env_infos = [] + + for manifest in manifests: + with open(manifest) as f: + contents = json.load(f) + # Make these paths absolute again + stats_files.append(os.path.join(training_dir, contents['stats'])) + videos += [(os.path.join(training_dir, v), os.path.join(training_dir, m)) + for v, m in contents['videos']] + env_infos.append(contents['env_info']) + + env_info = collapse_env_infos(env_infos, training_dir) + data_sources, initial_reset_timestamps, timestamps, episode_lengths, episode_rewards, episode_types, initial_reset_timestamp = merge_stats_files(stats_files) + + return { + 'manifests': manifests, + 'env_info': env_info, + 'data_sources': data_sources, + 'timestamps': timestamps, + 'episode_lengths': episode_lengths, + 'episode_rewards': episode_rewards, + 'episode_types': episode_types, + 'initial_reset_timestamps': initial_reset_timestamps, + 'initial_reset_timestamp': initial_reset_timestamp, + 'videos': videos, + } + +def merge_stats_files(stats_files): + timestamps = [] + episode_lengths = [] + episode_rewards = [] + episode_types = [] + initial_reset_timestamps = [] + data_sources = [] + + for i, path in enumerate(stats_files): + with open(path) as f: + content = json.load(f) + if len(content['timestamps'])==0: continue # so empty file doesn't mess up results, due to null initial_reset_timestamp + data_sources += [i] * len(content['timestamps']) + timestamps += content['timestamps'] + episode_lengths += content['episode_lengths'] + episode_rewards += content['episode_rewards'] + # Recent addition + episode_types += content.get('episode_types', []) + # Keep track of where each episode came from. + initial_reset_timestamps.append(content['initial_reset_timestamp']) + + idxs = np.argsort(timestamps) + timestamps = np.array(timestamps)[idxs].tolist() + episode_lengths = np.array(episode_lengths)[idxs].tolist() + episode_rewards = np.array(episode_rewards)[idxs].tolist() + data_sources = np.array(data_sources)[idxs].tolist() + + if episode_types: + episode_types = np.array(episode_types)[idxs].tolist() + else: + episode_types = None + + if len(initial_reset_timestamps) > 0: + initial_reset_timestamp = min(initial_reset_timestamps) + else: + initial_reset_timestamp = 0 + + return data_sources, initial_reset_timestamps, timestamps, episode_lengths, episode_rewards, episode_types, initial_reset_timestamp + +# TODO training_dir isn't used except for error messages, clean up the layering +def collapse_env_infos(env_infos, training_dir): + assert len(env_infos) > 0 + + first = env_infos[0] + for other in env_infos[1:]: + if first != other: + raise error.Error('Found two unequal env_infos: {} and {}. 
This usually indicates that your training directory {} has commingled results from multiple runs.'.format(first, other, training_dir)) + + for key in ['env_id', 'gym_version']: + if key not in first: + raise error.Error("env_info {} from training directory {} is missing expected key {}. This is unexpected and likely indicates a bug in gym.".format(first, training_dir, key)) + return first \ No newline at end of file diff --git a/src/gym/wrappers/monitoring/__init__.py b/src/gym/wrappers/monitoring/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/gym/wrappers/monitoring/__pycache__/__init__.cpython-37.pyc b/src/gym/wrappers/monitoring/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..286a5e7 Binary files /dev/null and b/src/gym/wrappers/monitoring/__pycache__/__init__.cpython-37.pyc differ diff --git a/src/gym/wrappers/monitoring/__pycache__/stats_recorder.cpython-37.pyc b/src/gym/wrappers/monitoring/__pycache__/stats_recorder.cpython-37.pyc new file mode 100644 index 0000000..2df8bbd Binary files /dev/null and b/src/gym/wrappers/monitoring/__pycache__/stats_recorder.cpython-37.pyc differ diff --git a/src/gym/wrappers/monitoring/__pycache__/video_recorder.cpython-37.pyc b/src/gym/wrappers/monitoring/__pycache__/video_recorder.cpython-37.pyc new file mode 100644 index 0000000..c49284c Binary files /dev/null and b/src/gym/wrappers/monitoring/__pycache__/video_recorder.cpython-37.pyc differ diff --git a/src/gym/wrappers/monitoring/stats_recorder.py b/src/gym/wrappers/monitoring/stats_recorder.py new file mode 100644 index 0000000..998c89b --- /dev/null +++ b/src/gym/wrappers/monitoring/stats_recorder.py @@ -0,0 +1,103 @@ +import json +import os +import time + +from gym import error +from gym.utils import atomic_write +from gym.utils.json_utils import json_encode_np + +class StatsRecorder(object): + def __init__(self, directory, file_prefix, autoreset=False, env_id=None): + self.autoreset = autoreset + self.env_id = env_id + + self.initial_reset_timestamp = None + self.directory = directory + self.file_prefix = file_prefix + self.episode_lengths = [] + self.episode_rewards = [] + self.episode_types = [] # experimental addition + self._type = 't' + self.timestamps = [] + self.steps = None + self.total_steps = 0 + self.rewards = None + + self.done = None + self.closed = False + + filename = '{}.stats.json'.format(self.file_prefix) + self.path = os.path.join(self.directory, filename) + + @property + def type(self): + return self._type + + @type.setter + def type(self, type): + if type not in ['t', 'e']: + raise error.Error('Invalid episode type {}: must be t for training or e for evaluation', type) + self._type = type + + def before_step(self, action): + assert not self.closed + + if self.done: + raise error.ResetNeeded("Trying to step environment which is currently done. While the monitor is active for {}, you cannot step beyond the end of an episode. Call 'env.reset()' to start the next episode.".format(self.env_id)) + elif self.steps is None: + raise error.ResetNeeded("Trying to step an environment before reset. 
While the monitor is active for {}, you must call 'env.reset()' before taking an initial step.".format(self.env_id)) + + def after_step(self, observation, reward, done, info): + self.steps += 1 + self.total_steps += 1 + self.rewards += reward + self.done = done + + if done: + self.save_complete() + + if done: + if self.autoreset: + self.before_reset() + self.after_reset(observation) + + def before_reset(self): + assert not self.closed + + if self.done is not None and not self.done and self.steps > 0: + raise error.Error("Tried to reset environment which is not done. While the monitor is active for {}, you cannot call reset() unless the episode is over.".format(self.env_id)) + + self.done = False + if self.initial_reset_timestamp is None: + self.initial_reset_timestamp = time.time() + + def after_reset(self, observation): + self.steps = 0 + self.rewards = 0 + # We write the type at the beginning of the episode. If a user + # changes the type, it's more natural for it to apply next + # time the user calls reset(). + self.episode_types.append(self._type) + + def save_complete(self): + if self.steps is not None: + self.episode_lengths.append(self.steps) + self.episode_rewards.append(float(self.rewards)) + self.timestamps.append(time.time()) + + def close(self): + self.flush() + self.closed = True + + def flush(self): + if self.closed: + return + + with atomic_write.atomic_write(self.path) as f: + json.dump({ + 'initial_reset_timestamp': self.initial_reset_timestamp, + 'timestamps': self.timestamps, + 'episode_lengths': self.episode_lengths, + 'episode_rewards': self.episode_rewards, + 'episode_types': self.episode_types, + }, f, default=json_encode_np) diff --git a/src/gym/wrappers/monitoring/tests/__init__.py b/src/gym/wrappers/monitoring/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/gym/wrappers/monitoring/tests/helpers.py b/src/gym/wrappers/monitoring/tests/helpers.py new file mode 100644 index 0000000..4c57385 --- /dev/null +++ b/src/gym/wrappers/monitoring/tests/helpers.py @@ -0,0 +1,9 @@ +import contextlib +import shutil +import tempfile + +@contextlib.contextmanager +def tempdir(): + temp = tempfile.mkdtemp() + yield temp + shutil.rmtree(temp) diff --git a/src/gym/wrappers/monitoring/tests/test_video_recorder.py b/src/gym/wrappers/monitoring/tests/test_video_recorder.py new file mode 100644 index 0000000..5a4a11d --- /dev/null +++ b/src/gym/wrappers/monitoring/tests/test_video_recorder.py @@ -0,0 +1,65 @@ +import json +import os +import shutil +import tempfile +import numpy as np + +import gym +from gym.wrappers.monitoring.video_recorder import VideoRecorder + +class BrokenRecordableEnv(object): + metadata = {'render.modes': [None, 'rgb_array']} + + def render(self, mode=None): + pass + +class UnrecordableEnv(object): + metadata = {'render.modes': [None]} + + def render(self, mode=None): + pass + +def test_record_simple(): + env = gym.make("CartPole-v1") + rec = VideoRecorder(env) + env.reset() + rec.capture_frame() + rec.close() + assert not rec.empty + assert not rec.broken + assert os.path.exists(rec.path) + f = open(rec.path) + assert os.fstat(f.fileno()).st_size > 100 + +def test_no_frames(): + env = BrokenRecordableEnv() + rec = VideoRecorder(env) + rec.close() + assert rec.empty + assert rec.functional + assert not os.path.exists(rec.path) + +def test_record_unrecordable_method(): + env = UnrecordableEnv() + rec = VideoRecorder(env) + assert not rec.enabled + rec.close() + +def test_record_breaking_render_method(): + env = BrokenRecordableEnv() + 
rec = VideoRecorder(env) + rec.capture_frame() + rec.close() + assert rec.empty + assert rec.broken + assert not os.path.exists(rec.path) + +def test_text_envs(): + env = gym.make('FrozenLake-v0') + video = VideoRecorder(env) + try: + env.reset() + video.capture_frame() + video.close() + finally: + os.remove(video.path) diff --git a/src/gym/wrappers/monitoring/video_recorder.py b/src/gym/wrappers/monitoring/video_recorder.py new file mode 100644 index 0000000..280b366 --- /dev/null +++ b/src/gym/wrappers/monitoring/video_recorder.py @@ -0,0 +1,309 @@ +import json +import os +import subprocess +import tempfile +import os.path +import distutils.spawn, distutils.version +import numpy as np +from six import StringIO +import six +from gym import error, logger + +def touch(path): + open(path, 'a').close() + +class VideoRecorder(object): + """VideoRecorder renders a nice movie of a rollout, frame by frame. It + comes with an `enabled` option so you can still use the same code + on episodes where you don't want to record video. + + Note: + You are responsible for calling `close` on a created + VideoRecorder, or else you may leak an encoder process. + + Args: + env (Env): Environment to take video of. + path (Optional[str]): Path to the video file; will be randomly chosen if omitted. + base_path (Optional[str]): Alternatively, path to the video file without extension, which will be added. + metadata (Optional[dict]): Contents to save to the metadata file. + enabled (bool): Whether to actually record video, or just no-op (for convenience) + """ + + def __init__(self, env, path=None, metadata=None, enabled=True, base_path=None): + modes = env.metadata.get('render.modes', []) + self._async = env.metadata.get('semantics.async') + self.enabled = enabled + + # Don't bother setting anything else if not enabled + if not self.enabled: + return + + self.ansi_mode = False + if 'rgb_array' not in modes: + if 'ansi' in modes: + self.ansi_mode = True + else: + logger.info('Disabling video recorder because {} supports neither video mode "rgb_array" nor "ansi".'.format(env)) + # Whoops, turns out we shouldn't be enabled after all + self.enabled = False + return + + if path is not None and base_path is not None: + raise error.Error("You can pass at most one of `path` or `base_path`.") + + self.last_frame = None + self.env = env + + required_ext = '.json' if self.ansi_mode else '.mp4' + if path is None: + if base_path is not None: + # Base path given, append ext + path = base_path + required_ext + else: + # Otherwise, just generate a unique filename + with tempfile.NamedTemporaryFile(suffix=required_ext, delete=False) as f: + path = f.name + self.path = path + + path_base, actual_ext = os.path.splitext(self.path) + + if actual_ext != required_ext: + hint = " HINT: The environment is text-only, therefore we're recording its text output in a structured JSON format." if self.ansi_mode else '' + raise error.Error("Invalid path given: {} -- must have file extension {}.{}".format(self.path, required_ext, hint)) + # Touch the file in any case, so we know it's present. (This + # corrects for platform differences: using ffmpeg on + # OS X, the file is precreated, but not on Linux.)
+ touch(path) + + self.frames_per_sec = env.metadata.get('video.frames_per_second', 30) + self.encoder = None # lazily start the process + self.broken = False + + # Dump metadata + self.metadata = metadata or {} + self.metadata['content_type'] = 'video/vnd.openai.ansivid' if self.ansi_mode else 'video/mp4' + self.metadata_path = '{}.meta.json'.format(path_base) + self.write_metadata() + + logger.info('Starting new video recorder writing to %s', self.path) + self.empty = True + + @property + def functional(self): + return self.enabled and not self.broken + + def capture_frame(self): + """Render the given `env` and add the resulting frame to the video.""" + if not self.functional: return + logger.debug('Capturing video frame: path=%s', self.path) + + render_mode = 'ansi' if self.ansi_mode else 'rgb_array' + frame = self.env.render(mode=render_mode) + + if frame is None: + if self._async: + return + else: + # Indicates a bug in the environment: don't want to raise + # an error here. + logger.warn('Env returned None on render(). Disabling further rendering for video recorder by marking as disabled: path=%s metadata_path=%s', self.path, self.metadata_path) + self.broken = True + else: + self.last_frame = frame + if self.ansi_mode: + self._encode_ansi_frame(frame) + else: + self._encode_image_frame(frame) + + def close(self): + """Make sure to manually close, or else you'll leak the encoder process""" + if not self.enabled: + return + + if self.encoder: + logger.debug('Closing video encoder: path=%s', self.path) + self.encoder.close() + self.encoder = None + else: + # No frames captured. Set metadata, and remove the empty output file. + os.remove(self.path) + + if self.metadata is None: + self.metadata = {} + self.metadata['empty'] = True + + # If broken, get rid of the output file, otherwise we'd leak it. + if self.broken: + logger.info('Cleaning up paths for broken video recorder: path=%s metadata_path=%s', self.path, self.metadata_path) + + # Might have crashed before even starting the output file, don't try to remove in that case. + if os.path.exists(self.path): + os.remove(self.path) + + if self.metadata is None: + self.metadata = {} + self.metadata['broken'] = True + + self.write_metadata() + + def write_metadata(self): + with open(self.metadata_path, 'w') as f: + json.dump(self.metadata, f) + + def _encode_ansi_frame(self, frame): + if not self.encoder: + self.encoder = TextEncoder(self.path, self.frames_per_sec) + self.metadata['encoder_version'] = self.encoder.version_info + self.encoder.capture_frame(frame) + self.empty = False + + def _encode_image_frame(self, frame): + if not self.encoder: + self.encoder = ImageEncoder(self.path, frame.shape, self.frames_per_sec) + self.metadata['encoder_version'] = self.encoder.version_info + + try: + self.encoder.capture_frame(frame) + except error.InvalidFrame as e: + logger.warn('Tried to pass invalid video frame, marking as broken: %s', e) + self.broken = True + else: + self.empty = False + + +class TextEncoder(object): + """Store a moving picture made out of ANSI frames. 
Format adapted from + https://github.com/asciinema/asciinema/blob/master/doc/asciicast-v1.md""" + + def __init__(self, output_path, frames_per_sec): + self.output_path = output_path + self.frames_per_sec = frames_per_sec + self.frames = [] + + def capture_frame(self, frame): + string = None + if isinstance(frame, str): + string = frame + elif isinstance(frame, StringIO): + string = frame.getvalue() + else: + raise error.InvalidFrame('Wrong type {} for {}: text frame must be a string or StringIO'.format(type(frame), frame)) + + frame_bytes = string.encode('utf-8') + + if frame_bytes[-1:] != six.b('\n'): + raise error.InvalidFrame('Frame must end with a newline: """{}"""'.format(string)) + + if six.b('\r') in frame_bytes: + raise error.InvalidFrame('Frame contains carriage returns (only newlines are allowed): """{}"""'.format(string)) + + self.frames.append(frame_bytes) + + def close(self): + #frame_duration = float(1) / self.frames_per_sec + frame_duration = .5 + + # Turn frames into events: clear screen beforehand + # https://rosettacode.org/wiki/Terminal_control/Clear_the_screen#Python + # https://rosettacode.org/wiki/Terminal_control/Cursor_positioning#Python + clear_code = six.b("%c[2J\033[1;1H" % (27)) + # Decode the bytes as UTF-8 since JSON may only contain UTF-8 + events = [ (frame_duration, (clear_code+frame.replace(six.b('\n'),six.b('\r\n'))).decode('utf-8')) for frame in self.frames ] + + # Calculate frame size from the largest frames. + # Add some padding since we'll get cut off otherwise. + height = max([frame.count(six.b('\n')) for frame in self.frames]) + 1 + width = max([max([len(line) for line in frame.split(six.b('\n'))]) for frame in self.frames]) + 2 + + data = { + "version": 1, + "width": width, + "height": height, + "duration": len(self.frames)*frame_duration, + "command": "-", + "title": "gym VideoRecorder episode", + "env": {}, # could add some env metadata here + "stdout": events, + } + + with open(self.output_path, 'w') as f: + json.dump(data, f) + + @property + def version_info(self): + return {'backend':'TextEncoder','version':1} + +class ImageEncoder(object): + def __init__(self, output_path, frame_shape, frames_per_sec): + self.proc = None + self.output_path = output_path + # Frame shape should be lines-first, so w and h are swapped + h, w, pixfmt = frame_shape + if pixfmt != 3 and pixfmt != 4: + raise error.InvalidFrame("Your frame has shape {}, but we require (h,w,3) or (h,w,4), i.e. RGB values for an h-by-w image, with an optional alpha channel.".format(frame_shape)) + self.wh = (w,h) + self.includes_alpha = (pixfmt == 4) + self.frame_shape = frame_shape + self.frames_per_sec = frames_per_sec + + if distutils.spawn.find_executable('avconv') is not None: + self.backend = 'avconv' + elif distutils.spawn.find_executable('ffmpeg') is not None: + self.backend = 'ffmpeg' + else: + raise error.DependencyNotInstalled("""Found neither the ffmpeg nor avconv executables. On OS X, you can install ffmpeg via `brew install ffmpeg`. On most Ubuntu variants, `sudo apt-get install ffmpeg` should do it.
On Ubuntu 14.04, however, you'll need to install avconv with `sudo apt-get install libav-tools`.""") + + self.start() + + @property + def version_info(self): + return { + 'backend':self.backend, + 'version':str(subprocess.check_output([self.backend, '-version'], + stderr=subprocess.STDOUT)), + 'cmdline':self.cmdline + } + + def start(self): + self.cmdline = (self.backend, + '-nostats', + '-loglevel', 'error', # suppress warnings + '-y', + '-r', '%d' % self.frames_per_sec, + + # input + '-f', 'rawvideo', + '-s:v', '{}x{}'.format(*self.wh), + '-pix_fmt',('rgb32' if self.includes_alpha else 'rgb24'), + '-i', '-', # this used to be /dev/stdin, which is not Windows-friendly + + # output + '-vcodec', 'libx264', + '-pix_fmt', 'yuv420p', + self.output_path + ) + + logger.debug('Starting ffmpeg with "%s"', ' '.join(self.cmdline)) + if hasattr(os,'setsid'): #setsid not present on Windows + self.proc = subprocess.Popen(self.cmdline, stdin=subprocess.PIPE, preexec_fn=os.setsid) + else: + self.proc = subprocess.Popen(self.cmdline, stdin=subprocess.PIPE) + + def capture_frame(self, frame): + if not isinstance(frame, (np.ndarray, np.generic)): + raise error.InvalidFrame('Wrong type {} for {} (must be np.ndarray or np.generic)'.format(type(frame), frame)) + if frame.shape != self.frame_shape: + raise error.InvalidFrame("Your frame has shape {}, but the VideoRecorder is configured for shape {}.".format(frame.shape, self.frame_shape)) + if frame.dtype != np.uint8: + raise error.InvalidFrame("Your frame has data type {}, but we require uint8 (i.e. RGB values from 0-255).".format(frame.dtype)) + + if distutils.version.LooseVersion(np.__version__) >= distutils.version.LooseVersion('1.9.0'): + self.proc.stdin.write(frame.tobytes()) + else: + self.proc.stdin.write(frame.tostring()) + + def close(self): + self.proc.stdin.close() + ret = self.proc.wait() + if ret != 0: + logger.error("VideoRecorder encoder exited with status {}".format(ret)) diff --git a/src/gym/wrappers/tests/__init__.py b/src/gym/wrappers/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/gym/wrappers/time_limit.py b/src/gym/wrappers/time_limit.py new file mode 100644 index 0000000..5784ca0 --- /dev/null +++ b/src/gym/wrappers/time_limit.py @@ -0,0 +1,44 @@ +import time +from gym import Wrapper, logger + +class TimeLimit(Wrapper): + def __init__(self, env, max_episode_seconds=None, max_episode_steps=None): + super(TimeLimit, self).__init__(env) + self._max_episode_seconds = max_episode_seconds + self._max_episode_steps = max_episode_steps + + self._elapsed_steps = 0 + self._episode_started_at = None + + @property + def _elapsed_seconds(self): + return time.time() - self._episode_started_at + + def _past_limit(self): + """Return true if we are past our limit""" + if self._max_episode_steps is not None and self._max_episode_steps <= self._elapsed_steps: + logger.debug("Env has passed the step limit defined by TimeLimit.") + return True + + if self._max_episode_seconds is not None and self._max_episode_seconds <= self._elapsed_seconds: + logger.debug("Env has passed the seconds limit defined by TimeLimit.") + return True + + return False + + def step(self, action): + assert self._episode_started_at is not None, "Cannot call env.step() before calling reset()" + observation, reward, done, info = self.env.step(action) + self._elapsed_steps += 1 + + if self._past_limit(): + if self.metadata.get('semantics.autoreset'): + _ = self.reset() # automatically reset the env + done = True + + return observation, reward, done, info + 
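+ # Illustrative usage sketch for this wrapper (assumes `import gym` and `from gym.wrappers.time_limit import TimeLimit`; 'CartPole-v1' is just an example id): + # env = TimeLimit(gym.make('CartPole-v1'), max_episode_steps=200) + # obs = env.reset() + # done = False + # while not done: + # obs, reward, done, info = env.step(env.action_space.sample())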
+ + def reset(self): + self._episode_started_at = time.time() + self._elapsed_steps = 0 + return self.env.reset() diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..54f53fe --- /dev/null +++ b/src/main.py @@ -0,0 +1,87 @@ +import asyncio + +from pilesos.mapstate import MAP, Action, MapState + + +async def handle_example_connection(reader, writer): + data = await reader.read(100) + message = data.decode() + addr = writer.get_extra_info('peername') + print(f"Received {message!r} from {addr!r}") + + print(f"Send: {message!r}") + writer.write(data) + await writer.drain() + + print("Close the client socket") + writer.close() + + + + + +async def handle_pilesos_connection(reader, writer): + writer.write( + b"""Hey, you. You're finally awake. You were trying to cross the border, right? Walked right into that Imperial ambush, same as us, and that thief over there.\n""") + writer.write(b'What is your name?\n') + await writer.drain() + + name = (await reader.readline()).decode().strip() + + writer.write(f'Hi {name}\n'.encode()) + writer.write(b"""Protocol is as follows:\n""") + writer.write(b"""- You get three bits of sensor data:\n""") + writer.write(b""" - Whether you bumped into something earlier\n""") + writer.write(b""" - Whether there is dirt at your position\n""") + writer.write(b""" - Whether you are home\n""") + writer.write(b"""- Each tick you can output a command (one letter):\n""") + writer.write(b""" - F -- go forward one cell\n""") + writer.write(b""" - R -- turn right by 90 degrees\n""") + writer.write(b""" - L -- turn left by 90 degrees\n""") + writer.write(b""" - S -- suck up dirt\n""") + writer.write(b""" - T -- turn off and finish\n""") + writer.write(b"""- After you finish, you will receive your score, and this connection will be closed.\n""") + await writer.drain() + + map_state = MapState(MAP) + score = 0 + + writer.write(f'{map_state.get_current_obs()}\n'.encode()) + await writer.drain() + + while True: + command = (await reader.readline()).decode().strip() + + try: + # The wire protocol uses single letters; 'T' (turn off) maps to Action.DIE. + action = Action.DIE if command == 'T' else Action.from_char(command) + result = map_state.act(action) + except Exception as e: + writer.write(f'{e}\n'.encode()) + await writer.drain() + continue + + observation, reward, finished = result + score += reward + + writer.write(f'{result}\n'.encode()) + await writer.drain() + + if finished: + break + + writer.write(f'Your score is {score}\n'.encode()) + writer.write(b'Well done!\n') + await writer.drain() + + writer.close() + + +async def run_server(): + server = await asyncio.start_server(handle_pilesos_connection, '0.0.0.0', 5555) + + addr = server.sockets[0].getsockname() + print(f'Serving on {addr}') + + async with server: + await server.serve_forever() + + +if __name__ == '__main__': + asyncio.run(run_server()) diff --git a/src/pilesos/__init__.py b/src/pilesos/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/pilesos/__pycache__/__init__.cpython-37.pyc b/src/pilesos/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..db29783 Binary files /dev/null and b/src/pilesos/__pycache__/__init__.cpython-37.pyc differ diff --git a/src/pilesos/__pycache__/mapstate.cpython-37.pyc b/src/pilesos/__pycache__/mapstate.cpython-37.pyc new file mode 100644 index 0000000..c5923d0 Binary files /dev/null and b/src/pilesos/__pycache__/mapstate.cpython-37.pyc differ diff --git a/src/pilesos/__pycache__/pilesos.cpython-37.pyc b/src/pilesos/__pycache__/pilesos.cpython-37.pyc new file mode 100644 index 0000000..cde120d Binary files /dev/null and b/src/pilesos/__pycache__/pilesos.cpython-37.pyc differ diff --git
a/src/pilesos/__pycache__/shirokii.cpython-37.pyc b/src/pilesos/__pycache__/shirokii.cpython-37.pyc new file mode 100644 index 0000000..ad5c9bf Binary files /dev/null and b/src/pilesos/__pycache__/shirokii.cpython-37.pyc differ diff --git a/src/pilesos/mapstate.py b/src/pilesos/mapstate.py new file mode 100644 index 0000000..3678f07 --- /dev/null +++ b/src/pilesos/mapstate.py @@ -0,0 +1,154 @@ +from enum import Enum, auto +from typing import Tuple + +import numpy as np + +# MAP = """ +# S.#.. +# ..#.. +# x..x. +# ..#.. +# ..#.. +# """ + +MAP = """ + Sxx.. + .##xx + ..xxx +""" + +REWARD_EVERY_ACTION = -1 +REWARD_SUCKED_DUST = 100 +REWARD_FINISHED_NOT_AT_HOME = -1000 + +directions = np.array([ + [0, 1], + [1, 0], + [0, -1], + [-1, 0], +], dtype=int) + + +class CellState(Enum): + EMPTY = auto() + WALL = auto() + STARTING = auto() + DIRT = auto() + + @staticmethod + def from_char(c): + return { + '.': CellState.EMPTY, + '#': CellState.WALL, + 'S': CellState.STARTING, + 'x': CellState.DIRT, + }[c] + + +class Action(Enum): + FORWARD = auto() + LEFT = auto() + RIGHT = auto() + SUCK = auto() + DIE = auto() + + @staticmethod + def from_char(c): + return { + 'F': Action.FORWARD, + 'L': Action.LEFT, + 'R': Action.RIGHT, + 'S': Action.SUCK, + 'D': Action.DIE, + }[c] + + +class Observation: + has_bumped: bool + has_dirt: bool + is_home: bool + + def __init__(self, has_bumped, has_dirt, is_home): + self.has_bumped = has_bumped + self.has_dirt = has_dirt + self.is_home = is_home + + def __repr__(self): + return f'{self.has_bumped:b}{self.has_dirt:b}{self.is_home:b}' + + +class MapState: + n: int + m: int + map: np.ndarray + + starting_pos: np.ndarray + + current_pos: np.ndarray + current_direction: int + + last_reward: int + + @staticmethod + def default(): + return MapState(MAP) + + def __init__(self, s: str): + lines = s.split() + + self.n = len(lines) + self.m = len(lines[0]) + + self.map = np.zeros((self.n, self.m), dtype=object) + + for i in range(self.n): + for j in range(self.m): + self.map[i, j] = CellState.from_char(lines[i][j]) + + if self.map[i, j] == CellState.STARTING: + self.starting_pos = np.array([i, j], dtype=int) + self.current_pos = self.starting_pos + self.current_direction = 0 + + def get_current_obs(self, has_bumped=False): + current_cell = self.map[self.current_pos[0], self.current_pos[1]] + + return Observation(has_bumped=has_bumped, + has_dirt=current_cell == CellState.DIRT, + is_home=current_cell == CellState.STARTING) + + def act(self, action: Action) -> Tuple[Observation, int, bool]: + reward = REWARD_EVERY_ACTION + + if action == Action.FORWARD: + next_pos = self.current_pos + directions[self.current_direction] + + if 0 <= next_pos[0] < self.n and 0 <= next_pos[1] < self.m and \ + self.map[next_pos[0], next_pos[1]] != CellState.WALL: + self.current_pos = next_pos + return self.get_current_obs(), reward, False + + return self.get_current_obs(has_bumped=True), reward, False + if action == Action.RIGHT: + self.current_direction += 1 + self.current_direction %= 4 + + return self.get_current_obs(), reward, False + elif action == Action.LEFT: + self.current_direction -= 1 + self.current_direction %= 4 + + return self.get_current_obs(), reward, False + elif action == Action.SUCK: + if self.map[self.current_pos[0], self.current_pos[1]] == CellState.DIRT: + self.map[self.current_pos[0], self.current_pos[1]] = CellState.EMPTY + reward += REWARD_SUCKED_DUST + + return self.get_current_obs(), reward, False + elif action == Action.DIE: + if self.map[self.current_pos[0], 
self.current_pos[1]] != CellState.STARTING: + reward += REWARD_FINISHED_NOT_AT_HOME + + return self.get_current_obs(), reward, True + else: + raise ValueError(f'Invalid action {action}') diff --git a/src/pilesos/pilesos.py b/src/pilesos/pilesos.py new file mode 100644 index 0000000..82575ca --- /dev/null +++ b/src/pilesos/pilesos.py @@ -0,0 +1,6 @@ +from pilesos.mapstate import Action, Observation + + +class Pilesos: + def decide(self, observation: Observation, prev_reward: int) -> Action: + raise NotImplementedError() diff --git a/src/pilesos/shirokii.py b/src/pilesos/shirokii.py new file mode 100644 index 0000000..57d4a90 --- /dev/null +++ b/src/pilesos/shirokii.py @@ -0,0 +1,239 @@ +import collections +from typing import Dict, Tuple + +import numpy as np +from enum import Enum, auto + +from pilesos.mapstate import Action, Observation +from pilesos.pilesos import Pilesos + +directions = np.array([ + [0, 1], + [1, 0], + [0, -1], + [-1, 0], +], dtype=int) + + +def add(pos, dir_i): + return tuple(np.array(pos, dtype=int) + directions[dir_i]) + + +def bfs_neighbours(pos, dir_i): + a = [ + (Action.FORWARD, add(pos, dir_i), dir_i), + (Action.LEFT, pos, (dir_i + 3) % 4), + (Action.RIGHT, pos, (dir_i + 1) % 4), + ] + # np.random.shuffle(a) + return a + + +class CellState(Enum): + HOME = auto() + UNKNOWN = auto() + EMPTY = auto() + WALL = auto() + + +class Shirokii(Pilesos): + map: Dict[Tuple[int, int], CellState] + + pos: Tuple[int, int] + dir_i: int + + prev_pos: Tuple[int, int] + prev_action: Action + + bfs_distances: Dict[Tuple[Tuple[int, int], int], int] + bfs_came_by: Dict[Tuple[Tuple[int, int], int], Action] + bfs_came_from: Dict[Tuple[Tuple[int, int], int], Tuple[Tuple[int, int], int]] + + def __init__(self): + self.pos = (0, 0) + self.dir_i = 0 + + self.map = { + self.pos: CellState.HOME, + } + for dir_i in [0, 1]: + self.map[add(self.pos, dir_i)] = CellState.UNKNOWN + + for i in range(0, 15): + self.map[(-1, i)] = CellState.WALL + self.map[(i, -1)] = CellState.WALL + + self.prev_action = None + + self.bfs_distances = None + self.bfs_came_by = None + self.bfs_came_from = None + + def print(self): + min_x, min_y = +1e9, +1e9 + max_x, max_y = -1e9, -1e9 + + for pos, cell_state in self.map.items(): + min_x = min(min_x, pos[0]) + min_y = min(min_y, pos[1]) + + max_x = max(max_x, pos[0]) + max_y = max(max_y, pos[1]) + + print('===============') + + for x in range(min_x, max_x + 1): + for y in range(min_y, max_y + 1): + cell_state = self.map.get((x, y)) + + c = ' ' + if cell_state == CellState.HOME: + c = 'H' + if cell_state == CellState.EMPTY: + c = '.' + if cell_state == CellState.WALL: + c = '#' + if cell_state == CellState.UNKNOWN: + c = '?' 
+ + if (x, y) == self.pos: + if self.dir_i == 0: + c = '→' + if self.dir_i == 1: + c = '↓' + if self.dir_i == 2: + c = '←' + if self.dir_i == 3: + c = '↑' + + print(c, end='') + print() + + print('===============') + + def bfs(self): + q = collections.deque() + start_state = (tuple(self.pos), self.dir_i) + q.append(start_state) + + np.random.seed(0) + + self.bfs_distances = { + start_state: 0 + } + + self.bfs_came_by = { + start_state: None + } + + self.bfs_came_from = { + start_state: None + } + + while len(q) > 0: + state = q.popleft() + + pos, dir_i = state + + if self.map[pos] == CellState.UNKNOWN: + continue + + for action, new_pos, new_dir_i in bfs_neighbours(pos, dir_i): + new_state = new_pos, new_dir_i + + if new_state in self.bfs_distances: + continue + + if self.map[new_pos] == CellState.WALL: + continue + + self.bfs_distances[new_state] = self.bfs_distances[state] + 1 + self.bfs_came_by[new_state] = action + self.bfs_came_from[new_state] = state + + q.append(new_state) + + def how_to_go_to_state(self, state): + last_action = None + + while True: + pos, dir_i = state + if state == (self.pos, self.dir_i): + return last_action + + last_action = self.bfs_came_by[state] + state = self.bfs_came_from[state] + + def how_to_go_to_closest_state(self, desired_cell_state): + min_distance = 1e9 + min_state = None + max_dist2 = -1e9 + + for pos, cell_state in self.map.items(): + if cell_state != desired_cell_state: + continue + + cur_states = [(pos, dir_i) for dir_i in range(4)] + + cur_distances = [self.bfs_distances.get(state, 1e9) for state in cur_states] + + min_i = np.argmin(cur_distances) + cur_distance = cur_distances[min_i] + + cur_dist2 = pos[0] ** 2 + pos[1] ** 2 + + if cur_distance < min_distance: + min_distance = cur_distance + min_state = cur_states[min_i] + max_dist2 = cur_dist2 + elif cur_distance == min_distance and max_dist2 < cur_dist2: + min_distance = cur_distance + min_state = cur_states[min_i] + max_dist2 = cur_dist2 + + if min_state is None: + return None + + return self.how_to_go_to_state(min_state) + + def decide(self, observation: Observation, prev_reward: int) -> Action: + if self.prev_action == Action.FORWARD: + new_pos = add(self.pos, self.dir_i) + + if observation.has_bumped: + self.map[new_pos] = CellState.WALL + else: + if self.map[new_pos] == CellState.UNKNOWN: + self.map[new_pos] = CellState.EMPTY + + self.pos = new_pos + + for dir_i in range(4): + new_pos2 = add(self.pos, dir_i) + if new_pos2 not in self.map: + self.map[new_pos2] = CellState.UNKNOWN + + if self.prev_action == Action.LEFT: + self.dir_i = (self.dir_i + 3) % 4 + + if self.prev_action == Action.RIGHT: + self.dir_i = (self.dir_i + 1) % 4 + + if observation.has_dirt: + self.prev_action = Action.SUCK + return Action.SUCK + + self.bfs() + + action_to_unknown = self.how_to_go_to_closest_state(CellState.UNKNOWN) + if action_to_unknown is not None: + self.prev_action = action_to_unknown + return action_to_unknown + + action_to_home = self.how_to_go_to_closest_state(CellState.HOME) + if action_to_home is not None: + self.prev_action = action_to_home + return action_to_home + + self.prev_action = Action.DIE + return Action.DIE diff --git a/src/server/__init__.py b/src/server/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/server/server.py b/src/server/server.py new file mode 100644 index 0000000..ceac29d --- /dev/null +++ b/src/server/server.py @@ -0,0 +1 @@ +# def serve(socket) diff --git a/src/simulate.py b/src/simulate.py new file mode 100644 index 0000000..efb93ec --- 
/dev/null +++ b/src/simulate.py @@ -0,0 +1,50 @@ +from pilesos.mapstate import MapState +from pilesos.shirokii import Shirokii + +# MAP = """ +# S.x#x +# ##.#x +# xxx.x +# """ + +MAP = """ + S....... + .....x.. + ........ + ###..### + ......x. + ........ + .x#.x#x. + x.#..#.. +""" + + +def main(): + map_state = MapState(MAP) + pilesos = Shirokii() + + obs = map_state.get_current_obs() + reward = 0 + score = 0 + + while True: + action = pilesos.decide(obs, reward) + + pilesos.print() + print() + print(f'Observation = {obs}') + print(f'Reward = {reward}') + print(f'Score = {score}') + print(f'Action = {action}') + + obs, reward, has_finished = map_state.act(action) + score += reward + + if has_finished: + break + + print(score) + + +if __name__ == '__main__': + main() diff --git a/src/testenv.py b/src/testenv.py new file mode 100644 index 0000000..7115ea4 --- /dev/null +++ b/src/testenv.py @@ -0,0 +1,50 @@ +from pilesos.mapstate import MapState +from pilesos.shirokii import Shirokii + +# MAP = """ +# S.x#x +# ##.#x +# xxx.x +# """ + +MAP = """ + S.......... + ..#######.. + ..#.....#.. + ..#.#x#.#.. + ..#.###.#.. + ..#.....#.. + ..####.##.. + ..........x +""" + + +def main(): + map_state = MapState(MAP) + pilesos = Shirokii() + + obs = map_state.get_current_obs() + reward = 0 + score = 0 + + while True: + action = pilesos.decide(obs, reward) + + pilesos.print() + print() + print(f'Observation = {obs}') + print(f'Reward = {reward}') + print(f'Score = {score}') + print(f'Action = {action}') + + obs, reward, has_finished = map_state.act(action) + score += reward + + if has_finished: + break + + print(score) + + +if __name__ == '__main__': + main()
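For reference, the wire protocol that handle_pilesos_connection in src/main.py speaks (a greeting, a name prompt, a banner, then one observation line of three 0/1 bits per turn, with one-letter commands F/R/L/S/T going the other way) can be exercised with a small asyncio client. The sketch below is illustrative only: it assumes the server is running locally on port 5555, the file name client_sketch.py is hypothetical, and the two-command policy at the end is a placeholder rather than anything from the commit.

# client_sketch.py -- hypothetical helper, not part of the repository above.
import asyncio


def looks_like_observation(line: str) -> bool:
    # Observations are sent as three 0/1 digits: bumped, dirt, home.
    return len(line) == 3 and set(line) <= {'0', '1'}


async def run_client(host: str = '127.0.0.1', port: int = 5555) -> None:
    reader, writer = await asyncio.open_connection(host, port)

    # Greeting and name prompt.
    print((await reader.readline()).decode().rstrip())
    print((await reader.readline()).decode().rstrip())
    writer.write(b'test-bot\n')
    await writer.drain()

    # Skip the banner until the first observation line arrives.
    while True:
        line = (await reader.readline()).decode().strip()
        print(line)
        if looks_like_observation(line):
            break

    # Placeholder policy: suck once, then turn off while still at home.
    for command in ('S', 'T'):
        writer.write(command.encode() + b'\n')
        await writer.drain()
        print((await reader.readline()).decode().rstrip())

    # Score and sign-off lines.
    print((await reader.readline()).decode().rstrip())
    print((await reader.readline()).decode().rstrip())
    writer.close()


if __name__ == '__main__':
    asyncio.run(run_client())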