-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathseo4ajax.go
246 lines (217 loc) · 6.97 KB
/
seo4ajax.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
/*
Package seo4ajax provides a client library and HTTP middleware for the SEO4Ajax prerender service.
Create a client with New: Config.Token is required, and Config.IP should be set to the
server's IP address (it defaults to 127.0.0.1).
*/
package seo4ajax
import (
"errors"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"github.com/cenkalti/backoff"
"github.com/go-kit/kit/log"
)
var (
// ErrNoToken is returned by New when the config does not carry an API token.
ErrNoToken = errors.New("no token given")
// ErrCacheMiss happens if seo4ajax responded with a cache miss.
ErrCacheMiss = errors.New("cache miss from seo4ajax")
// ErrUnknownStatus represents an unknown status code.
ErrUnknownStatus = errors.New("Unknown Status Code")
// errRedirect is returned by the HTTP client's CheckRedirect hook (see New) so
// that redirects issued by the upstream API are not followed; GetPrerenderedPage
// recognises it by its message suffix and inspects the redirect response itself.
errRedirect = errors.New("SEO4AJAX: do not follow redirect")
// regexInvalidUserAgent matches user agents for which IsPrerender returns false,
// even if they would also match regexValidUserAgent (case-insensitive).
regexInvalidUserAgent = regexp.MustCompile(`(?i:bing|msnbot|yandexbot|pinterest.*ios|mail\.ru)`)
// regexValidUserAgent matches user agents for which IsPrerender returns true
// (case-insensitive).
regexValidUserAgent = regexp.MustCompile(`(?i:bot|google|crawler|spider|archiver|pinterest|facebookexternalhit|flipboardproxy)`)
// regexFilePath matches paths that contain a dot followed by at least two
// non-'?' characters, i.e. paths that look like a file with an extension.
// NOTE(review): the `?` in the second alternative is unescaped, so it acts as a
// lazy quantifier on {2,4} rather than matching a literal '?' - confirm whether
// `\?` was intended.
regexFilePath = regexp.MustCompile(`.*(\.[^?]{2,4}$|\.[^?]{2,4}?.*)`)
// regexIndexHTML matches "/index.htm" and "/index.html"; IsPrerender exempts
// such paths from the file-path exclusion.
regexIndexHTML = regexp.MustCompile(`/index\.html?`)
)
// Config is the Seo4Ajax Client config passed to New.
// Only Token is mandatory; New fills in defaults for Log, Server, IP,
// Transport and FetchErrorStatus when they are left at their zero value.
type Config struct {
// Log receives a warning when an upstream fetch fails; New substitutes a no-op logger when nil.
Log log.Logger
// Next is the handler ServeHTTP delegates to for requests that are not prerender requests; may be nil.
Next http.Handler
// Transport is the RoundTripper used for upstream requests; defaults to http.DefaultTransport.
Transport http.RoundTripper
Server string // seo4ajax api server, defaults to http://api.seo4ajax.com
Token string // seo4ajax token, must be set (New returns ErrNoToken otherwise)
IP string // server IP, sent as the first X-Forwarded-For entry, defaults to 127.0.0.1
// Timeout is the retry timeout: the maximum total time GetPrerenderedPage keeps retrying.
// It is only applied when > 0; New sets no default, so otherwise the backoff
// library's own default max elapsed time is used.
Timeout time.Duration
// s4a supports client side caching and returns an empty 304 if the content hasn't changed.
// If UnconditionalFetch is set to true the client side caching headers (If-Modified-Since and If-None-Match)
// are removed from the upstream request.
UnconditionalFetch bool
// FetchErrorStatus is the http status code returned if the fetch from seo4ajax fails.
// Defaults to 503 (http.StatusServiceUnavailable) when left at 0.
FetchErrorStatus int
// FetchTimeout is the http timeout for a single fetch attempt.
// It is only applied when > 0; otherwise the HTTP client has no timeout.
FetchTimeout time.Duration
// RetryUnavailable advises the retry loop to retry a fetch on 503 upstream results until success or Timeout.
// When false, a 503 or 404 upstream response ends the retry loop immediately.
RetryUnavailable bool
}
// Client is the Seo4Ajax Client. It implements http.Handler (see ServeHTTP)
// and is created with New, which copies its settings from a Config.
type Client struct {
// log receives a warning when an upstream fetch fails.
log log.Logger
// next handles requests that are not prerender requests; may be nil.
next http.Handler
// server is the base URL of the seo4ajax API.
server string
// token is the seo4ajax API token, used as the first path segment of upstream URLs.
token string
// ip is sent as the first entry of the upstream X-Forwarded-For header.
ip string
// timeout caps the total retry time when > 0.
timeout time.Duration
// http is the client used for upstream requests; it never follows redirects.
http *http.Client
// unconditionalFetch strips If-Modified-Since / If-None-Match from upstream requests.
unconditionalFetch bool
// fetchErrorStatus is the status code sent to the caller when the fetch fails.
fetchErrorStatus int
// retryUnavailable keeps retrying on upstream 503/404 responses instead of giving up at once.
retryUnavailable bool
}
// New builds a Seo4Ajax client from cfg.
//
// A missing Token yields ErrNoToken. Every other unset option falls back to a
// default: a no-op logger, the public API server, 127.0.0.1 as server IP,
// http.DefaultTransport and a 503 fetch error status. The HTTP client never
// follows redirects and only gets a timeout when FetchTimeout is positive.
func New(cfg Config) (*Client, error) {
	if cfg.Token == "" {
		return nil, ErrNoToken
	}

	logger := cfg.Log
	if logger == nil {
		logger = log.NewNopLogger()
	}

	server := cfg.Server
	if server == "" {
		server = "http://api.seo4ajax.com"
	}

	ip := cfg.IP
	if ip == "" {
		ip = "127.0.0.1"
	}

	transport := cfg.Transport
	if transport == nil {
		transport = http.DefaultTransport
	}

	errorStatus := cfg.FetchErrorStatus
	if errorStatus == 0 {
		errorStatus = http.StatusServiceUnavailable
	}

	httpClient := &http.Client{
		Transport: transport,
		// Redirects are surfaced to GetPrerenderedPage instead of being followed.
		CheckRedirect: func(*http.Request, []*http.Request) error {
			return errRedirect
		},
	}
	if cfg.FetchTimeout > 0 {
		httpClient.Timeout = cfg.FetchTimeout
	}

	return &Client{
		log:                logger,
		next:               cfg.Next,
		server:             server,
		token:              cfg.Token,
		ip:                 ip,
		timeout:            cfg.Timeout,
		http:               httpClient,
		unconditionalFetch: cfg.UnconditionalFetch,
		fetchErrorStatus:   errorStatus,
		retryUnavailable:   cfg.RetryUnavailable,
	}, nil
}
// IsPrerender returns true, when Seo4Ajax shall be used for the given http Request.
// The logic is taken from https://github.com/seo4ajax/connect-s4a/blob/master/lib/connect-s4a.js
func IsPrerender(r *http.Request) bool {
if r.Method != "GET" && r.Method != "HEAD" {
return false
}
if strings.Contains(r.URL.RawQuery, "_escaped_fragment_") {
return true
}
if regexInvalidUserAgent.MatchString(r.Header.Get("User-Agent")) {
return false
}
if !regexIndexHTML.MatchString(r.URL.Path) && regexFilePath.MatchString(r.URL.Path) {
return false
}
return regexValidUserAgent.MatchString(r.Header.Get("User-Agent"))
}
// ServeHTTP makes Client usable both as HTTP middleware and as a stand-alone
// handler. Prerender requests (see IsPrerender) are answered with the
// prerendered page. Every other request is passed to the configured next
// handler, or answered with a 500 Internal Server Error when no next handler
// is set.
func (c *Client) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	switch {
	case IsPrerender(r):
		c.GetPrerenderedPage(w, r)
	case c.next != nil:
		c.next.ServeHTTP(w, r)
	default:
		http.Error(w, "Internal Server Error", http.StatusInternalServerError)
	}
}
// GetPrerenderedPage fetches the prerendered html for r from the seo4ajax api
// and writes it to w.
//
// The upstream request is a GET to <server>/<token><cleaned request URL>. It
// carries a copy of the incoming headers, with the configured server IP
// prepended to X-Forwarded-For and, when unconditionalFetch is set, without
// If-Modified-Since / If-None-Match.
//
// The fetch is retried with exponential backoff (50ms initial interval, 30s
// max interval, total time capped by the configured timeout when > 0):
//   - a 302 from upstream is passed on to the caller as a redirect,
//   - a 503 or 404 ends the loop immediately unless retryUnavailable is set,
//   - any other non-200 status, or a transport error, is retried,
//   - a 200 copies the upstream headers and body to w.
//
// If the fetch ultimately fails, a warning is logged and - provided nothing
// has been written to w yet - "Upstream error" is sent with fetchErrorStatus.
func (c *Client) GetPrerenderedPage(w http.ResponseWriter, r *http.Request) {
	// outputStarted records that response data was (at least partially) sent
	// to w; from then on neither a retry nor http.Error is safe.
	var outputStarted bool

	opFunc := func() error {
		req, err := http.NewRequest("GET", fmt.Sprintf("%s/%s%s", c.server, c.token, cleanPath(r.URL)), nil)
		if err != nil {
			return err
		}

		// Copy the headers instead of aliasing r.Header: the edits below must
		// neither leak into the caller's request nor accumulate across retries
		// (aliasing made every retry prepend c.ip to X-Forwarded-For again).
		req.Header = make(http.Header, len(r.Header))
		for name, values := range r.Header {
			req.Header[name] = append([]string(nil), values...)
		}

		ips := []string{c.ip}
		if xff := r.Header.Get("X-Forwarded-For"); xff != "" {
			ips = append(ips, xff)
		}
		req.Header.Set("X-Forwarded-For", strings.Join(ips, ", "))

		if c.unconditionalFetch {
			req.Header.Del("If-Modified-Since")
			req.Header.Del("If-None-Match")
		}

		resp, err := c.http.Do(req)
		// A refused redirect surfaces as an error that still carries the
		// redirect response; every other error (or a missing response) is
		// handed to the retry loop.
		if err != nil && (resp == nil || !strings.HasSuffix(err.Error(), errRedirect.Error())) {
			return err
		}
		defer resp.Body.Close()

		if resp.StatusCode == http.StatusFound {
			http.Redirect(w, r, resp.Header.Get("Location"), http.StatusFound)
			return nil
		}

		// conditionally terminate retry loop if the status code is 503 or 404
		if !c.retryUnavailable {
			if resp.StatusCode == http.StatusServiceUnavailable {
				return backoff.Permanent(errors.New("page not yet rendered"))
			}
			if resp.StatusCode == http.StatusNotFound {
				return backoff.Permanent(errors.New("page not found"))
			}
		}

		if resp.StatusCode != http.StatusOK {
			// retry
			return fmt.Errorf("expected 200 status code, got %d", resp.StatusCode)
		}

		for header, val := range resp.Header {
			w.Header()[header] = val
		}

		outputStarted = true

		// Once the body is being written a retry would emit headers and body a
		// second time and corrupt the HTTP response, so a copy failure must
		// end the retry loop instead of triggering another attempt.
		if _, err := io.Copy(w, resp.Body); err != nil {
			return backoff.Permanent(err)
		}
		return nil
	}

	bo := backoff.NewExponentialBackOff()
	bo.InitialInterval = 50 * time.Millisecond
	bo.MaxInterval = 30 * time.Second
	if c.timeout > 0 {
		bo.MaxElapsedTime = c.timeout
	}

	if err := backoff.Retry(opFunc, bo); err != nil {
		c.log.Log("level", "warn", "msg", "Upstream request failed", "err", err, "path", r.URL.Path)
		if !outputStarted {
			http.Error(w, "Upstream error", c.fetchErrorStatus)
		}
	}
}
// cleanPath renders u as a host-relative reference: scheme, opaque part, user
// info and host are dropped, while path, query and fragment are kept. The
// result always starts with "/" (an empty path becomes "/"). u itself is not
// modified; the function works on a copy.
func cleanPath(u *url.URL) string {
	rel := *u
	rel.Scheme, rel.Opaque, rel.Host = "", "", ""
	rel.User = nil

	switch {
	case rel.Path == "":
		rel.Path = "/"
	case rel.Path[0] != '/':
		rel.Path = "/" + rel.Path
	}
	return rel.String()
}