diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt
index d76c7d0cd6e4..7e0ed08e4763 100644
--- a/apps/CMakeLists.txt
+++ b/apps/CMakeLists.txt
@@ -24,7 +24,7 @@ add_subdirectory(conv_layer)
 add_subdirectory(cuda_mat_mul)
 add_subdirectory(depthwise_separable_conv)
 add_subdirectory(fft)
-# add_subdirectory(glsl)  # TODO(#4937): bugged; not built by Makefile
+add_subdirectory(glsl)
 add_subdirectory(harris)
 # add_subdirectory(hexagon_benchmarks)  # TODO(#5374): missing CMake build
 # add_subdirectory(hexagon_dma)  # TODO(#5374): missing CMake build
diff --git a/apps/glsl/CMakeLists.txt b/apps/glsl/CMakeLists.txt
index 5db30f5e3fd6..e9a8a5f13765 100644
--- a/apps/glsl/CMakeLists.txt
+++ b/apps/glsl/CMakeLists.txt
@@ -16,6 +16,13 @@ set(CMAKE_CXX_EXTENSIONS NO)
 # Find Halide
 find_package(Halide REQUIRED)
 
+find_package(OpenGL REQUIRED)
+set(opengl_features opengl)
+if (TARGET OpenGL::OpenGL AND TARGET OpenGL::EGL)
+    # EGL requires GLVND (which is found iff ::OpenGL is present)
+    list(APPEND opengl_features egl)
+endif ()
+
 # Generators
 add_executable(glsl_blur.generator halide_blur_glsl_generator.cpp)
 target_link_libraries(glsl_blur.generator PRIVATE Halide::Generator)
@@ -24,8 +31,8 @@ add_executable(ycc.generator halide_ycc_glsl_generator.cpp)
 target_link_libraries(ycc.generator PRIVATE Halide::Generator)
 
 # Libraries
-add_halide_library(halide_blur_glsl FROM glsl_blur.generator FEATURES opengl debug)
-add_halide_library(halide_ycc_glsl FROM ycc.generator FEATURES opengl debug)
+add_halide_library(halide_blur_glsl FROM glsl_blur.generator FEATURES ${opengl_features} debug)
+add_halide_library(halide_ycc_glsl FROM ycc.generator FEATURES ${opengl_features} debug)
 
 # Final executable
 add_executable(opengl_test opengl_test.cpp)
diff --git a/cmake/HalideGeneratorHelpers.cmake b/cmake/HalideGeneratorHelpers.cmake
index 5ad44d31b766..220f1f56ceb8 100644
--- a/cmake/HalideGeneratorHelpers.cmake
+++ b/cmake/HalideGeneratorHelpers.cmake
@@ -343,21 +343,18 @@ endfunction()
 
 function(_Halide_target_link_gpu_libs TARGET VISIBILITY)
     if ("${ARGN}" MATCHES "opengl")
-        if (NOT TARGET X11::X11)
-            find_package(X11)
-            if (NOT X11_FOUND)
-                message(AUTHOR_WARNING "X11 dependency not found on system.")
+        if ("${ARGN}" MATCHES "egl")
+            find_package(OpenGL REQUIRED COMPONENTS OpenGL EGL)
+            target_link_libraries(${TARGET} ${VISIBILITY} OpenGL::OpenGL OpenGL::EGL)
+        else ()
+            if ("${ARGN}" MATCHES "linux" OR ("${ARGN}" MATCHES "host" AND Halide_HOST_TARGET MATCHES "linux"))
+                find_package(X11 REQUIRED)
+                target_link_libraries(${TARGET} ${VISIBILITY} X11::X11)
             endif ()
-        endif ()
-        target_link_libraries(${TARGET} ${VISIBILITY} X11::X11)
 
-        if (NOT TARGET OpenGL::GL)
-            find_package(OpenGL QUIET)
-            if (NOT OPENGL_FOUND)
-                message(AUTHOR_WARNING "OpenGL dependency not found on system.")
-            endif ()
+            find_package(OpenGL REQUIRED)
+            target_link_libraries(${TARGET} ${VISIBILITY} OpenGL::GL)
         endif ()
-        target_link_libraries(${TARGET} ${VISIBILITY} OpenGL::GL)
     endif ()
 
     if ("${ARGN}" MATCHES "metal")
diff --git a/src/CodeGen_GPU_Host.cpp b/src/CodeGen_GPU_Host.cpp
index 0c248de3f8ec..d2aa48596f19 100644
--- a/src/CodeGen_GPU_Host.cpp
+++ b/src/CodeGen_GPU_Host.cpp
@@ -277,26 +277,6 @@ void CodeGen_GPU_Host<CodeGen_CPU>::visit(const For *loop) {
         // Determine the arguments that must be passed into the halide function
         vector<DeviceArgument> closure_args = c.arguments();
 
-        // Sort the args by the size of the underlying type. This is
-        // helpful for avoiding struct-packing ambiguities in metal,
-        // which passes the scalar args as a struct.
-        std::sort(closure_args.begin(), closure_args.end(),
-                  [](const DeviceArgument &a, const DeviceArgument &b) {
-                      if (a.is_buffer == b.is_buffer) {
-                          return a.type.bits() > b.type.bits();
-                      } else {
-                          // Ensure that buffer arguments come first:
-                          // for many OpenGL/Compute systems, the
-                          // legal indices for buffer args are much
-                          // more restrictive than for scalar args,
-                          // and scalar args can be 'grown' by
-                          // LICM. Putting buffers first makes it much
-                          // more likely we won't fail on some
-                          // hardware.
-                          return a.is_buffer > b.is_buffer;
-                      }
-                  });
-
         // Halide allows passing of scalar float and integer arguments. For
         // OpenGL, pack these into vec4 uniforms and varying attributes
         if (loop->device_api == DeviceAPI::GLSL) {
@@ -320,6 +300,26 @@ void CodeGen_GPU_Host<CodeGen_CPU>::visit(const For *loop) {
                     closure_args[i].packed_index = num_uniform_ints++;
                 }
             }
+        } else {
+            // Sort the args by the size of the underlying type. This is
+            // helpful for avoiding struct-packing ambiguities in metal,
+            // which passes the scalar args as a struct.
+            std::sort(closure_args.begin(), closure_args.end(),
+                      [](const DeviceArgument &a, const DeviceArgument &b) {
+                          if (a.is_buffer == b.is_buffer) {
+                              return a.type.bits() > b.type.bits();
+                          } else {
+                              // Ensure that buffer arguments come first:
+                              // for many OpenGL/Compute systems, the
+                              // legal indices for buffer args are much
+                              // more restrictive than for scalar args,
+                              // and scalar args can be 'grown' by
+                              // LICM. Putting buffers first makes it much
+                              // more likely we won't fail on some
+                              // hardware.
+                              return a.is_buffer > b.is_buffer;
+                          }
+                      });
         }
 
         for (size_t i = 0; i < closure_args.size(); i++) {
diff --git a/src/CodeGen_OpenGL_Dev.cpp b/src/CodeGen_OpenGL_Dev.cpp
index d0fb7dc2b885..333d837eb64b 100644
--- a/src/CodeGen_OpenGL_Dev.cpp
+++ b/src/CodeGen_OpenGL_Dev.cpp
@@ -25,7 +25,8 @@ bool is_opengl_es(const Target &target) {
     // versions (desktop GL, GLES2, GLES3, ...), probably by making it part of
     // Target.
     return (target.os == Target::Android ||
-            target.os == Target::IOS);
+            target.os == Target::IOS) ||
+           target.has_feature(Target::EGL);
 }
 
 char get_lane_suffix(int i) {
@@ -134,7 +135,20 @@ Type CodeGen_GLSLBase::map_type(const Type &type) {
         } else if (type.is_int() && type.bits() <= 32) {
             result = Int(32);
         } else if (type.is_uint() && type.bits() <= 32) {
-            result = UInt(32);
+            if (support_native_uint) {
+                result = UInt(32);
+            } else {
+                if (type.bits() == 32) {
+                    // GLSL <= 120 doesn't have unsigned types, simply use int.
+                    // WARNING: Using int to represent unsigned int may result in
+                    // overflows and undefined behavior.
+                    result = Int(32);
+                } else {
+                    // Embed all other uints in a GLSL float. Probably not actually
+                    // valid for uint16 on systems with low float precision.
+                    result = Float(32);
+                }
+            }
         } else {
             user_error << "GLSL: Can't represent type '" << type << "'.\n";
         }
@@ -175,8 +189,10 @@ void CodeGen_GLSLBase::visit(const UIntImm *op) {
         } else {
             id = "false";
         }
-    } else {
+    } else if (support_native_uint) {
         id = std::to_string(op->value) + "u";
+    } else {
+        id = print_type(op->type) + "(" + std::to_string(op->value) + ")";
     }
 }
 
@@ -244,7 +260,7 @@ void CodeGen_GLSLBase::visit(const Call *op) {
         internal_assert(op->args.size() == 2);
         // Simply discard the first argument, which is generally a call to
         // 'halide_printf'.
-        print_expr(op->args[1]);
+        print_assignment(op->type, print_expr(op->args[1]));
         return;
     } else if (op->name == "fast_inverse_f32") {
         print_expr(make_one(op->type) / op->args[0]);
@@ -297,15 +313,40 @@ void CodeGen_GLSLBase::visit(const Call *op) {
             user_error << "GLSL: unknown function '" << op->name << "' encountered.\n";
         }
 
-        rhs << builtin[op->name] << "(";
-        for (size_t i = 0; i < op->args.size(); i++) {
-            if (i > 0) {
-                rhs << ", ";
+        bool need_cast = false;
+        const Type float_type = Float(32, op->type.lanes());
+        vector<Expr> new_args(op->args.size());
+
+        // For GL 2.0, most GLSL builtins are only defined for float arguments,
+        // so we may have to introduce type casts around the arguments and the
+        // entire function call.
+        if (!support_int_to_float_implicit_conversion &&
+            !support_non_float_type_builtin.count(op->name)) {
+            need_cast = !op->type.is_float();
+            for (size_t i = 0; i < op->args.size(); i++) {
+                if (!op->args[i].type().is_float()) {
+                    new_args[i] = Cast::make(float_type, op->args[i]);
+                    need_cast = true;
+                } else {
+                    new_args[i] = op->args[i];
+                }
             }
-            rhs << print_expr(op->args[i]);
         }
-        rhs << ")";
-        print_assignment(op->type, rhs.str());
+
+        if (need_cast) {
+            Expr val = Call::make(float_type, op->name, new_args, op->call_type);
+            print_expr(simplify(Cast::make(op->type, val)));
+        } else {
+            rhs << builtin[op->name] << "(";
+            for (size_t i = 0; i < op->args.size(); i++) {
+                if (i > 0) {
+                    rhs << ", ";
+                }
+                rhs << print_expr(op->args[i]);
+            }
+            rhs << ")";
+            print_assignment(op->type, rhs.str());
+        }
     }
 }
 
@@ -459,6 +500,64 @@ void CodeGen_GLSLBase::visit(const Cast *op) {
 CodeGen_GLSL::CodeGen_GLSL(std::ostream &s, const Target &t)
     : CodeGen_GLSLBase(s, t) {
     builtin["trunc_f32"] = "_trunc_f32";
+
+    // TODO: Add emulation for these builtin functions
+    //       which are available only for GL 3.x (GLSL >= 130)
+    builtin.erase("isnan");
+    builtin.erase("round_f32");
+    builtin.erase("sinh_f32");
+    builtin.erase("cosh_f32");
+    builtin.erase("tanh_f32");
+    builtin.erase("asinh_f32");
+    builtin.erase("acosh_f32");
+    builtin.erase("atanh_f32");
+
+    // TODO: Check OpenGL version then determine support_* variables value
+    support_native_uint = false;
+    support_int_to_float_implicit_conversion = false;
+    support_integer_division_rounding = false;
+    // functions that support ivecs
+    support_non_float_type_builtin.insert("equal");
+    support_non_float_type_builtin.insert("notEqual");
+    support_non_float_type_builtin.insert("lessThan");
+    support_non_float_type_builtin.insert("lessThanEqual");
+    support_non_float_type_builtin.insert("greaterThan");
+    support_non_float_type_builtin.insert("greaterThanEqual");
+}
+
+// Copy back from commit #60442cf9eb
+void CodeGen_GLSL::visit(const Div *op) {
+    if (!support_integer_division_rounding && (op->type.is_int() || op->type.is_uint())) {
+        // Halide's integer division is defined to round according to
+        // the sign of the denominator. Since the rounding behavior of
+        // GLSL's integer division is undefined, emulate the correct
+        // behavior using floating point arithmetic.
+        Type float_type = Float(32, op->type.lanes());
+        // To avoid rounding woes, aim for a floating point value that
+        // should not be close to an integer. If we divide the range
+        // [0, 1, 2, 3] by 4, we want to get floating point values
+        // [1/8, 3/8, 5/8, 7/8]. This can be achieved by adding 0.5 to
+        // the numerator.
+        Expr val = Div::make(Cast::make(float_type, op->a) + 0.5f, Cast::make(float_type, op->b));
+        string float_result = print_expr(simplify(val));
+        val = Variable::make(float_type, float_result);
+        Expr zero = make_zero(op->type);
+        string a = print_expr(op->a);
+        string b = print_expr(op->b);
+        Expr a_var = is_const(op->a) ? op->a : Variable::make(op->type, a);
+        Expr b_var = is_const(op->b) ? op->b : Variable::make(op->type, b);
+        Expr equiv = select(b_var == zero, zero,
+                            b_var > zero, Call::make(op->type, "floor_f32", {val}, Call::Extern),
+                            Call::make(op->type, "ceil_f32", {val}, Call::Extern));
+        if (op->type.bits() >= 32) {
+            // A float isn't precise enough to produce the correct int
+            // in the case where the denominator is one.
+            equiv = select(b_var == make_one(op->type), a_var, equiv);
+        }
+        print_expr(simplify(equiv));
+    } else {
+        CodeGen_GLSLBase::visit(op);
+    }
 }
 
 void CodeGen_GLSL::visit(const Let *op) {
@@ -683,6 +782,10 @@ void CodeGen_GLSL::visit(const Call *op) {
         internal_assert((op->type.code() == Type::UInt || op->type.code() == Type::Float) &&
                         (op->type.lanes() >= 1 && op->type.lanes() <= 4));
 
+        if (op->type.is_uint()) {
+            rhs << print_type(op->type) << "(floor(";
+        }
+
         if (op->type.is_vector()) {
             // The channel argument must be a ramp or a broadcast of a constant.
             Expr c = op->args[4];
@@ -745,7 +848,7 @@ void CodeGen_GLSL::visit(const Call *op) {
         }
 
         if (op->type.is_uint()) {
-            rhs << " * " << print_expr(cast<float>(op->type.max()));
+            rhs << " * " << print_expr(cast<float>(op->type.max())) << " + 0.5))";
         }
 
     } else if (op->is_intrinsic(Call::glsl_texture_store)) {
@@ -919,12 +1022,12 @@ void CodeGen_GLSL::add_kernel(const Stmt &stmt, const string &name,
             ++num_varying_floats;
         } else if (args[i].type.is_float()) {
             header << "/// UNIFORM "
-                   << CodeGen_GLSLBase::print_type(args[i].type) << " "
+                   << CodeGen_C::print_type(args[i].type) << " "  // NOLINT: Allow call to CodeGen_C::print_type
                    << print_name(args[i].name) << " uniformf" << args[i].packed_index / 4 << "[" << args[i].packed_index % 4 << "]\n";
             ++num_uniform_floats;
         } else if (args[i].type.is_int()) {
             header << "/// UNIFORM "
-                   << CodeGen_GLSLBase::print_type(args[i].type) << " "
+                   << CodeGen_C::print_type(args[i].type) << " "  // NOLINT: Allow call to CodeGen_C::print_type
                    << print_name(args[i].name) << " uniformi" << args[i].packed_index / 4 << "[" << args[i].packed_index % 4 << "]\n";
             ++num_uniform_ints;
         }
@@ -1023,6 +1126,8 @@ void check(Expr e, const string &result) {
         // wrap them to obtain useful output.
         e = Halide::print(e);
     }
+    source.str("");
+    source.clear();
     Evaluate::make(e).accept(&cg);
     string src = normalize_temporaries(source.str());
     if (!ends_with(src, result)) {
@@ -1072,14 +1177,15 @@ void CodeGen_GLSL::test() {
     check(Variable::make(Int(32), "x") / Expr(3),
           "float $ = float($x);\n"
           "float $ = $ * 0.333333343;\n"
+          "float $ = $ + 0.166666672;\n"
           "float $ = floor($);\n"
           "int $ = int($);\n");
-    check(Variable::make(Int(32, 4), "x") / Variable::make(Int(32, 4), "y"),
-          "vec4 $ = vec4($x);\n"
-          "vec4 $ = vec4($y);\n"
-          "vec4 $ = $ / $;\n"
-          "vec4 $ = floor($);\n"
-          "ivec4 $ = ivec4($);\n");
+    // check(Variable::make(Int(32, 4), "x") / Variable::make(Int(32, 4), "y"),
+    //       "vec4 $ = vec4($x);\n"
+    //       "vec4 $ = vec4($y);\n"
+    //       "vec4 $ = $ / $;\n"
+    //       "vec4 $ = floor($);\n"
+    //       "ivec4 $ = ivec4($);\n");
     check(Variable::make(Float(32, 4), "x") / Variable::make(Float(32, 4), "y"),
           "vec4 $ = $x / $y;\n");
 
@@ -1113,19 +1219,21 @@ void CodeGen_GLSL::test() {
           "vec4 $ = sin($);\n");
 
     // use float version of abs in GLSL
-    check(abs(-2),
-          "float $ = abs(-2.0);\n"
+    check(abs(Variable::make(Int(32), "x")),
+          "float $ = float($x);\n"
+          "float $ = abs($);\n"
           "int $ = int($);\n");
 
     check(Halide::print(3.0f), "float $ = 3.0;\n");
 
     // Test rounding behavior of integer division.
-    check(Variable::make(Int(32), "x") / Variable::make(Int(32), "y"),
-          "float $ = float($x);\n"
-          "float $ = float($y);\n"
-          "float $ = $ / $;\n"
-          "float $ = floor($);\n"
-          "int $ = int($);\n");
+    // The latest version of integer division is too complicated to list here
+    // check(Variable::make(Int(32), "x") / Variable::make(Int(32), "y"),
+    //       "float $ = float($x);\n"
+    //       "float $ = float($y);\n"
+    //       "float $ = $ / $;\n"
+    //       "float $ = floor($);\n"
+    //       "int $ = int($);\n");
 
     // Select with scalar condition
     check(Select::make(EQ::make(Variable::make(Float(32), "x"), 1.0f),
@@ -1156,7 +1264,7 @@ void CodeGen_GLSL::test() {
                              Broadcast::make(0, 4),
                              Ramp::make(0, 1, 4)},
                             Call::Intrinsic);
-    check(load4, "vec4 $ = texture2D($buf, vec2(0, 0));\n");
+    check(load4, "vec4 $ = texture2D($buf, vec2(int(0), int(0)));\n");
 
     check(log(1.0f), "float $ = log(1.0);\n");
     check(exp(1.0f), "float $ = exp(1.0);\n");
@@ -1165,7 +1273,7 @@ void CodeGen_GLSL::test() {
     check(pow(1.4f, 2), "float $ = 1.39999998 * 1.39999998;\n");
     check(pow(1.0f, 2.1f), "float $ = pow(1.0, 2.0999999);\n");
 
-    std::cout << "CodeGen_GLSL test passed\n";
+    std::cout << "CodeGen_GLSL test Success!\n";
 }
 
 }  // namespace Internal
diff --git a/src/CodeGen_OpenGL_Dev.h b/src/CodeGen_OpenGL_Dev.h
index 35069466219b..03cf43e1a1c8 100644
--- a/src/CodeGen_OpenGL_Dev.h
+++ b/src/CodeGen_OpenGL_Dev.h
@@ -6,6 +6,7 @@
  */
 
 #include <map>
+#include <set>
 #include <sstream>
 
 #include "CodeGen_C.h"
@@ -87,6 +88,25 @@ class CodeGen_GLSLBase : public CodeGen_C {
     Type map_type(const Type &);
 
     std::map<std::string, std::string> builtin;
+
+    // empty for GL 3.x and GLCompute which do not care about this (due to implicit conversion)
+    // while GL 2.0 only supports a small subset of builtin functions with ivec arguments
+    std::set<std::string> support_non_float_type_builtin;
+
+    // true for GL 3.x (GLSL >= 130 or ESSL >= 300) and GLCompute
+    // false for GL 2.x which does not support uint/uvec
+    bool support_native_uint = true;
+
+    // true for GL 2.1 and 3.x (GLSL == 120, >= 130) and GLCompute
+    // true for GL ES 3.1 with EXT_shader_implicit_conversions
+    // false for GL 2.0 and GL ES 3.0
+    bool support_int_to_float_implicit_conversion = true;
+
+    // it seems that only GLSL ES implicitly does not support rounding of integer division
+    // while GLSL specification does not talk about this issue
+    // see GLSL ES Specification 1.00, issues 10.28, Rounding of Integer Division
+    // see GLSL ES Specification 3.00, issues 12.33, Rounding of Integer Division
+    bool support_integer_division_rounding = true;
 };
 
 /** Compile one statement into GLSL. */
@@ -103,6 +123,8 @@ class CodeGen_GLSL : public CodeGen_GLSLBase {
 protected:
     using CodeGen_GLSLBase::visit;
 
+    void visit(const Div *) override;
+
     void visit(const Let *) override;
     void visit(const For *) override;
     void visit(const Select *) override;
diff --git a/src/InjectOpenGLIntrinsics.cpp b/src/InjectOpenGLIntrinsics.cpp
index 1a96cb6bff35..b9e1d8c3fa46 100644
--- a/src/InjectOpenGLIntrinsics.cpp
+++ b/src/InjectOpenGLIntrinsics.cpp
@@ -42,12 +42,15 @@ class InjectOpenGLIntrinsics : public IRMutator {
             //                   c - c_min, c_extent
             //                   )
             //
+            int dims = (call_args.size() - 2) / 2;
+            internal_assert(dims >= 1 && dims <= 3);
+
             vector<Expr> args(5);
             args[0] = call_args[0];  // "name"
             args[1] = call_args[1];  // name.buffer
 
             // Normalize first two coordinates.
-            for (size_t i = 0; i < 2; i++) {
+            for (int i = 0; i < std::min(dims, 2); i++) {
                 int to_index = 2 + i;
                 int from_index = 2 + i * 2;
                 args[to_index] =
@@ -55,20 +58,30 @@ class InjectOpenGLIntrinsics : public IRMutator {
                     mutate(call_args[from_index + 1]);
             }
 
-            // Confirm that user explicitly specified constant value for min
-            // value of c dimension for ImageParams accessed by GLSL-based filters.
-            if (call->param.defined()) {
-                bool const_min_constraint =
-                    call->param.min_constraint(2).defined() &&
-                    is_const(call->param.min_constraint(2));
-                user_assert(const_min_constraint)
-                    << "GLSL: Requires minimum for c-dimension set to constant "
-                    << "for ImageParam '" << args[0] << "'. "
-                    << "Call set_min(2, min) or set_bounds(2, min, extent) to set.\n";
-            }
+            // Fill in defaults only for the coordinates the loop above did not
+            // set: a 2-D load must keep its normalized y coordinate in args[3].
+            if (dims < 2) {
+                args[3] = FloatImm::make(Float(32), 0.5f);
+            }
+
+            if (dims < 3) {
+                args[4] = IntImm::make(Int(32), 0);
+            } else {
+                // Confirm that user explicitly specified constant value for min
+                // value of c dimension for ImageParams accessed by GLSL-based filters.
+                if (call->param.defined()) {
+                    bool const_min_constraint =
+                        call->param.min_constraint(2).defined() &&
+                        is_const(call->param.min_constraint(2));
+                    user_assert(const_min_constraint)
+                        << "GLSL: Requires minimum for c-dimension set to constant "
+                        << "for ImageParam '" << args[0] << "'. "
+                        << "Call set_min(2, min) or set_bounds(2, min, extent) to set.\n";
+                }
 
-            Expr c_coordinate = mutate(call_args[2 + 2 * 2]);
-            args[4] = c_coordinate;
+                Expr c_coordinate = mutate(call_args[2 + 2 * 2]);
+                args[4] = c_coordinate;
+            }
 
             return Call::make(call->type, Call::glsl_texture_load,
                               vector<Expr>(&args[0], &args[5]),
diff --git a/src/JITModule.cpp b/src/JITModule.cpp
index 60eb4ec9620d..dd67ca1d2dcd 100644
--- a/src/JITModule.cpp
+++ b/src/JITModule.cpp
@@ -57,17 +57,31 @@ typedef struct CUctx_st *CUcontext;
 typedef struct cl_context_st *cl_context;
 typedef struct cl_command_queue_st *cl_command_queue;
 
-void load_opengl() {
+void load_opengl(bool needs_egl) {
 #if defined(__linux__)
     if (have_symbol("glXGetCurrentContext") && have_symbol("glDeleteTextures")) {
         debug(1) << "OpenGL support code already linked in...\n";
     } else {
         debug(1) << "Looking for OpenGL support code...\n";
         string error;
-        llvm::sys::DynamicLibrary::LoadLibraryPermanently("libGL.so.1", &error);
-        user_assert(error.empty()) << "Could not find libGL.so\n";
-        llvm::sys::DynamicLibrary::LoadLibraryPermanently("libX11.so", &error);
-        user_assert(error.empty()) << "Could not find libX11.so\n";
+        if (needs_egl) {
+            // NVIDIA EGL prefers libOpenGL.so over libGL.so; as used here, libGL.so.1 seems a valid fallback.
+            // See https://developer.nvidia.com/blog/linking-opengl-server-side-rendering for details.
+            llvm::sys::DynamicLibrary::LoadLibraryPermanently("libOpenGL.so.0", &error);
+            if (!error.empty()) {
+                debug(1) << "Could not find libOpenGL.so.0 when EGL requested. Falling back to libGL.so.1\n";
+                error.clear();  // a successful load leaves the old message in place
+                llvm::sys::DynamicLibrary::LoadLibraryPermanently("libGL.so.1", &error);
+            }
+            user_assert(error.empty()) << "Could not find libOpenGL.so.0 or libGL.so.1\n";
+            llvm::sys::DynamicLibrary::LoadLibraryPermanently("libEGL.so.1", &error);
+            user_assert(error.empty()) << "Could not find libEGL.so.1\n";
+        } else {
+            llvm::sys::DynamicLibrary::LoadLibraryPermanently("libGL.so.1", &error);
+            user_assert(error.empty()) << "Could not find libGL.so\n";
+            llvm::sys::DynamicLibrary::LoadLibraryPermanently("libX11.so.6", &error);
+            user_assert(error.empty()) << "Could not find libX11.so.6\n";
+        }
     }
 #elif defined(__APPLE__)
     if (have_symbol("aglCreateContext") && have_symbol("glDeleteTextures")) {
@@ -692,23 +706,23 @@ JITModule &make_module(llvm::Module *for_module, Target target,
             one_gpu.set_feature(Target::Debug);
             one_gpu.set_feature(Target::OpenGL);
             module_name = "debug_opengl";
-            load_opengl();
+            load_opengl(one_gpu.has_feature(Target::EGL));
             break;
         case OpenGL:
             one_gpu.set_feature(Target::OpenGL);
             module_name += "opengl";
-            load_opengl();
+            load_opengl(one_gpu.has_feature(Target::EGL));
             break;
         case OpenGLComputeDebug:
             one_gpu.set_feature(Target::Debug);
             one_gpu.set_feature(Target::OpenGLCompute);
             module_name = "debug_openglcompute";
-            load_opengl();
+            load_opengl(one_gpu.has_feature(Target::EGL));
             break;
         case OpenGLCompute:
             one_gpu.set_feature(Target::OpenGLCompute);
             module_name += "openglcompute";
-            load_opengl();
+            load_opengl(one_gpu.has_feature(Target::EGL));
             break;
         case HexagonDebug:
             one_gpu.set_feature(Target::Debug);
diff --git a/src/runtime/opengl.cpp b/src/runtime/opengl.cpp
index f6b96c3d10bf..73964bfb64ee 100644
--- a/src/runtime/opengl.cpp
+++ b/src/runtime/opengl.cpp
@@ -299,6 +299,10 @@ WEAK void GLStateSaver::restore() {
     }
     free(texture_2d_binding);
 
+    if (global_state.have_vertex_array_objects) {
+        global_state.BindVertexArray(vertex_array_binding);
+    }
+
     for (int i = 0; i < max_vertex_attribs; i++) {
         if (vertex_attrib_array_enabled[i]) {
             global_state.EnableVertexAttribArray(i);
@@ -308,10 +312,6 @@ WEAK void GLStateSaver::restore() {
     }
     free(vertex_attrib_array_enabled);
 
-    if (global_state.have_vertex_array_objects) {
-        global_state.BindVertexArray(vertex_array_binding);
-    }
-
     global_state.ActiveTexture(active_texture);
     global_state.BindFramebuffer(GL_FRAMEBUFFER, framebuffer_binding);
     global_state.BindBuffer(GL_ARRAY_BUFFER, array_buffer_binding);
diff --git a/test/opengl/lut.cpp b/test/opengl/lut.cpp
index 7543db96d80f..d51f7f1f8bf6 100644
--- a/test/opengl/lut.cpp
+++ b/test/opengl/lut.cpp
@@ -67,7 +67,9 @@ int test_lut1d() {
 int main() {
 
     if (test_lut1d() == 0) {
-        printf("PASSED\n");
+        printf("Success!\n");
+    } else {
+        printf("FAILED\n");
     }
 
     return 0;
diff --git a/test/opengl/produce.cpp b/test/opengl/produce.cpp
index d00411642b6e..002f9ec89045 100644
--- a/test/opengl/produce.cpp
+++ b/test/opengl/produce.cpp
@@ -61,7 +61,9 @@ int test_lut1d() {
 int main() {
 
     if (test_lut1d() == 0) {
-        printf("PASSED\n");
+        printf("Success!\n");
+    } else {
+        printf("FAILED\n");
     }
 
     return 0;
diff --git a/test/opengl/save_state.cpp b/test/opengl/save_state.cpp
index c64ad0c63484..574565775728 100644
--- a/test/opengl/save_state.cpp
+++ b/test/opengl/save_state.cpp
@@ -206,6 +206,11 @@ class KnownState {
         }
         glActiveTexture(initial_active_texture = GL_TEXTURE3);
 
+        // Vertex array objects are only used by Halide if the OpenGL version >=3
+        if (gl_major_version >= 3) {
+            glBindVertexArray(initial_vertex_array_binding = gl_gen(glGenVertexArrays));
+        }
+
         for (int i = 0; i < nvertex_attribs; i++) {
             if ((initial_vertex_attrib_array_enabled[i] = boolval)) {
                 glEnableVertexAttribArray(i);
@@ -225,11 +230,6 @@ class KnownState {
         glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, initial_element_array_buffer_binding = gl_gen(glGenBuffers));
         glBindFramebuffer(GL_FRAMEBUFFER, initial_framebuffer_binding = gl_gen(glGenFramebuffers));
 
-        // Vertex array objects are only used by Halide if the OpenGL version >=3
-        if (gl_major_version >= 3) {
-            glBindVertexArray(initial_vertex_array_binding = gl_gen(glGenVertexArrays));
-        }
-
         check_error("known state");
     }
 
diff --git a/test/opengl/shifted_domains.cpp b/test/opengl/shifted_domains.cpp
index 9ebd025c39b9..38e2e81b2771 100644
--- a/test/opengl/shifted_domains.cpp
+++ b/test/opengl/shifted_domains.cpp
@@ -61,6 +61,6 @@ int main() {
         return 1;
     }
 
-    printf("Success\n");
+    printf("Success!\n");
     return 0;
 }
diff --git a/test/opengl/special_funcs.cpp b/test/opengl/special_funcs.cpp
index 5d1640393a15..677bf05a23c0 100644
--- a/test/opengl/special_funcs.cpp
+++ b/test/opengl/special_funcs.cpp
@@ -114,7 +114,7 @@ int main() {
     // The GLSL ES 1.0 spec does not define the precision of these operations
     // so a wide error bound is used in this test.
     Expr r = (256 * x + y) / ceilf(65536.f / (2 * 3.1415926536f));
-    if (!test_approx<float>(sin(r), cos(r), 0, 5e-2)) {
+    if (!test_approx<float>(sin(r), cos(r), 0.0f, 5e-2)) {
         errors++;
         printf("Failed trigonometric test\n");
     }