583 lines
24 KiB
Python
583 lines
24 KiB
Python
import logging
|
|
import typing as tp
|
|
|
|
from httpcore import Request, Response
|
|
|
|
from hishel._headers import Vary, parse_cache_control
|
|
|
|
from ._utils import (
|
|
BaseClock,
|
|
Clock,
|
|
extract_header_values,
|
|
extract_header_values_decoded,
|
|
generate_key,
|
|
get_safe_url,
|
|
header_presents,
|
|
parse_date,
|
|
)
|
|
|
|
# Logger used for every caching decision made in this module (all messages are DEBUG level).
logger = logging.getLogger("hishel.controller")

# Status codes for which the controller may fall back to a heuristic freshness
# lifetime; only consulted when the controller is created with `allow_heuristics=True`.
HEURISTICALLY_CACHEABLE_STATUS_CODES = (200, 203, 204, 206, 300, 301, 308, 404, 405, 410, 414, 501)
# Method names accepted in `Controller(cacheable_methods=...)`; anything else raises RuntimeError.
HTTP_METHODS = ["GET", "HEAD", "POST", "PUT", "DELETE", "CONNECT", "OPTIONS", "TRACE", "PATCH"]

__all__ = ("Controller", "HEURISTICALLY_CACHEABLE_STATUS_CODES")
|
|
|
|
|
|
def get_updated_headers(
    stored_response_headers: tp.List[tp.Tuple[bytes, bytes]],
    new_response_headers: tp.List[tp.Tuple[bytes, bytes]],
) -> tp.List[tp.Tuple[bytes, bytes]]:
    """
    Merge the headers of a stored response with those of a validation response.

    For every header field of the stored response, the values from the new
    response replace the stored ones when the new response carries that field;
    otherwise the stored values are kept. Fields that only exist in the new
    response are appended afterwards. `Content-Length` is never copied from
    either side.

    Field names are compared case-insensitively, and every field is emitted
    exactly once per value: a field that appears under different casings in the
    two lists, or several times in the new response only, is not duplicated.
    The casing of the first occurrence of a field name is the one emitted.

    Args:
        stored_response_headers: Headers of the response held in the cache.
        new_response_headers: Headers of the freshly received response.

    Returns:
        A new list of `(name, value)` pairs; neither input list is modified.
    """

    def _group_by_name(headers: tp.List[tp.Tuple[bytes, bytes]]) -> tp.Dict[bytes, tp.List[bytes]]:
        # Collect the values of each field under its lower-cased name, keeping order.
        grouped: tp.Dict[bytes, tp.List[bytes]] = {}
        for name, value in headers:
            grouped.setdefault(name.lower(), []).append(value)
        return grouped

    stored_values = _group_by_name(stored_response_headers)
    new_values = _group_by_name(new_response_headers)

    updated_headers: tp.List[tp.Tuple[bytes, bytes]] = []
    checked: tp.Set[bytes] = set()

    # Stored fields come first (so their order and casing win), then new-only fields.
    for headers in (stored_response_headers, new_response_headers):
        for key, _ in headers:
            name = key.lower()
            if name in checked or name == b"content-length":
                continue
            checked.add(name)

            # Values from the new response take precedence over the stored ones.
            values = new_values.get(name) or stored_values[name]
            updated_headers.extend((key, value) for value in values)

    return updated_headers
|
|
|
|
|
|
def get_freshness_lifetime(response: Response) -> tp.Optional[int]:
    """
    Return the explicit freshness lifetime of a response, in seconds.

    The `max-age` response directive is used when present. Otherwise the
    lifetime is the `Expires` header value minus the `Date` header value.

    Args:
        response: The response whose headers are inspected.

    Returns:
        The freshness lifetime, or None when it cannot be determined: no
        `max-age` and no `Expires`, a missing `Date` header, or an `Expires`
        or `Date` value that `parse_date` cannot parse.
    """
    response_cache_control = parse_cache_control(extract_header_values_decoded(response.headers, b"Cache-Control"))

    if response_cache_control.max_age is not None:
        return response_cache_control.max_age

    if header_presents(response.headers, b"expires"):
        expires = extract_header_values_decoded(response.headers, b"expires", single=True)[0]
        expires_timestamp = parse_date(expires)
        if expires_timestamp is None:
            return None

        # `Expires` is only meaningful relative to `Date`. A response without a
        # `Date` header has no computable lifetime, so report it as unknown
        # instead of failing on an empty list.
        date_values = extract_header_values_decoded(response.headers, b"date", single=True)
        if not date_values:
            return None

        date_timestamp = parse_date(date_values[0])
        if date_timestamp is None:
            return None

        return expires_timestamp - date_timestamp
    return None
|
|
|
|
|
|
def get_heuristic_freshness(response: Response, clock: "BaseClock") -> int:
    """
    Estimate a freshness lifetime, in seconds, for a response without an explicit one.

    When a parsable `Last-Modified` header is available, the estimate is 10% of
    the time elapsed since that moment, capped at one week. In every other case
    the estimate is one day.
    """
    one_day = 86_400
    one_week = 604_800

    header_values = extract_header_values_decoded(response.headers, b"last-modified", single=True)
    if not header_values:
        return one_day

    modified_at = parse_date(header_values[0])
    if modified_at is None:
        return one_day

    elapsed = clock.now() - modified_at
    return min(one_week, int(elapsed * 0.1))
|
|
|
|
|
|
def get_age(response: Response, clock: "BaseClock") -> int:
    """
    Return the apparent age of a response in whole seconds.

    The age is the time elapsed between the `Date` header and `clock.now()`,
    never less than zero. When the `Date` header is missing or cannot be
    parsed, infinity is returned rather than raising, so that the response
    can never be considered fresh.
    """
    if header_presents(response.headers, b"date"):
        date_values = extract_header_values_decoded(response.headers, b"date")
        served_at = parse_date(date_values[0])
        if served_at is not None:
            return int(max(0, clock.now() - served_at))

    # Without a usable date the age is unknown; treat the response as infinitely old.
    return float("inf")  # type: ignore
|
|
|
|
|
|
def allowed_stale(response: Response) -> bool:
    """
    Tell whether the response's own directives permit serving it stale.

    A response carrying the `no-cache` or `must-revalidate` directive in its
    `Cache-Control` header must not be served stale; any other response may be.
    """
    directives = parse_cache_control(extract_header_values_decoded(response.headers, b"Cache-Control"))

    forbids_stale = bool(directives.no_cache) or bool(directives.must_revalidate)
    return not forbids_stale
|
|
|
|
|
|
class Controller:
    """
    Makes the caching decisions for request/response pairs.

    The controller decides whether a response may be stored (`is_cachable`),
    whether a stored response may be used, must be revalidated or must be
    ignored (`construct_response_from_cache`), and how a validation response
    is combined with the stored one (`handle_validation_response`).
    Every decision is reported through the "hishel.controller" logger at DEBUG level.
    """

    def __init__(
        self,
        cacheable_methods: tp.Optional[tp.List[str]] = None,
        cacheable_status_codes: tp.Optional[tp.List[int]] = None,
        cache_private: bool = True,
        allow_heuristics: bool = False,
        clock: tp.Optional[BaseClock] = None,
        allow_stale: bool = False,
        always_revalidate: bool = False,
        force_cache: bool = False,
        key_generator: tp.Optional[tp.Callable[[Request, tp.Optional[bytes]], str]] = None,
    ):
        """
        Configure the controller.

        Args:
            cacheable_methods: HTTP methods whose responses may be stored. The names
                are upper-cased before being kept. Defaults to `["GET"]`.
            cacheable_status_codes: Status codes that may be stored. `None` or an
                empty list selects the default `[200, 301, 308]`.
            cache_private: When False, responses carrying the `private` directive
                are not stored.
            allow_heuristics: Allow storing, and computing a heuristic freshness
                lifetime for, responses whose status code is listed in
                `HEURISTICALLY_CACHEABLE_STATUS_CODES`.
            clock: Time source used for age and heuristic calculations.
                Defaults to a new `Clock()`.
            allow_stale: Kept on the instance; not consulted by any method of this class.
            always_revalidate: Revalidate every stored response before using it.
            force_cache: Default for the per-request "force_cache" extension; it is
                used when the request does not set that extension.
            key_generator: Kept on the instance (defaults to `generate_key`);
                not called by any method of this class.

        Raises:
            RuntimeError: If `cacheable_methods` contains a name that is not in `HTTP_METHODS`.
        """
        self._cacheable_methods = []

        if cacheable_methods is None:
            self._cacheable_methods.append("GET")
        else:
            for method in cacheable_methods:
                if method.upper() not in HTTP_METHODS:
                    raise RuntimeError(
                        f"Hishel does not support the HTTP method `{method}`.\n"
                        f"Please use the methods from this list: {HTTP_METHODS}"
                    )
                self._cacheable_methods.append(method.upper())

        # A falsy value (None or an empty list) falls back to the default status codes.
        self._cacheable_status_codes = cacheable_status_codes if cacheable_status_codes else [200, 301, 308]
        self._cache_private = cache_private
        self._clock = clock if clock else Clock()
        self._allow_heuristics = allow_heuristics
        self._allow_stale = allow_stale
        self._always_revalidate = always_revalidate
        self._force_cache = force_cache
        self._key_generator = key_generator or generate_key

    def is_cachable(self, request: Request, response: Response) -> bool:
        """
        Determines whether the response may be cached.

        The only thing this method does is determine whether the
        response associated with this request can be cached for later use.
        `https://www.rfc-editor.org/rfc/rfc9111.html#name-storing-responses-in-caches`
        lists the steps that this method simply follows.

        The checks run in a fixed order and the first one that applies decides:
        the status code must be in the configured list; 301 and 308 responses are
        then accepted immediately; the request method must be in the configured
        list; a forced cache (request extension "force_cache", falling back to the
        controller setting) accepts the response; only after that are the
        `Cache-Control` directives of the request and the response examined.

        Args:
            request: The request that produced the response.
            response: The response that is a candidate for storing.

        Returns:
            True if the response may be stored, False otherwise.
        """
        method = request.method.decode("ascii")
        # The per-request extension, when set (even to False), overrides the controller default.
        force_cache = request.extensions.get("force_cache", None)

        if response.status not in self._cacheable_status_codes:
            logger.debug(
                (
                    f"Considering the resource located at {get_safe_url(request.url)} "
                    f"as not cachable since its status code ({response.status})"
                    " is not in the list of cacheable status codes."
                )
            )
            return False

        # Permanent redirects are accepted before the method and directive checks below.
        if response.status in (301, 308):
            logger.debug(
                (
                    f"Considering the resource located at {get_safe_url(request.url)} "
                    "as cachable since its status code is a permanent redirect."
                )
            )
            return True

        # the request method is understood by the cache
        if method not in self._cacheable_methods:
            logger.debug(
                (
                    f"Considering the resource located at {get_safe_url(request.url)} "
                    f"as not cachable since the request method ({method}) is not in the list of cacheable methods."
                )
            )
            return False

        if force_cache if force_cache is not None else self._force_cache:
            logger.debug(
                (
                    f"Considering the resource located at {get_safe_url(request.url)} "
                    "as cachable since the request is forced to use the cache."
                )
            )
            return True

        response_cache_control = parse_cache_control(extract_header_values_decoded(response.headers, b"cache-control"))
        request_cache_control = parse_cache_control(extract_header_values_decoded(request.headers, b"cache-control"))

        # the response status code is final
        if response.status // 100 == 1:
            logger.debug(
                (
                    f"Considering the resource located at {get_safe_url(request.url)} "
                    "as not cachable since its status code is informational."
                )
            )
            return False

        # the no-store cache directive is not present (see Section 5.2.2.5)
        if request_cache_control.no_store:
            logger.debug(
                (
                    f"Considering the resource located at {get_safe_url(request.url)} "
                    "as not cachable since the request contains the no-store directive."
                )
            )
            return False

        # note that the must-understand cache directive overrides
        # no-store in certain circumstances; see Section 5.2.2.3.
        if response_cache_control.no_store:
            if response_cache_control.must_understand:
                logger.debug(
                    (
                        f"Skipping the no-store directive for the resource located at {get_safe_url(request.url)} "
                        "since the response contains the must-understand directive."
                    )
                )
            else:
                logger.debug(
                    (
                        f"Considering the resource located at {get_safe_url(request.url)} "
                        "as not cachable since the response contains the no-store directive."
                    )
                )
                return False

        # a shared cache must not store a response with private directive
        # Note that we do not implement special handling for the qualified form,
        # which would only forbid storing specified headers.
        if not self._cache_private and response_cache_control.private:
            logger.debug(
                (
                    f"Considering the resource located at {get_safe_url(request.url)} "
                    "as not cachable since the response contains the private directive."
                )
            )
            return False

        expires_presents = header_presents(response.headers, b"expires")
        # the response contains at least one of the following:
        # - a public response directive (see Section 5.2.2.9);
        # - a private response directive, if the cache is not shared (see Section 5.2.2.7);
        # - an Expires header field (see Section 5.3);
        # - a max-age response directive (see Section 5.2.2.1);
        # - if the cache is shared: an s-maxage response directive (see Section 5.2.2.10);
        # - a cache extension that allows it to be cached (see Section 5.2.3); or
        # - a status code that is defined as heuristically cacheable (see Section 4.2.2).
        # Of the items above, this method checks the heuristic status codes (only when
        # heuristics are enabled), public, private, Expires and max-age.
        if self._allow_heuristics and response.status in HEURISTICALLY_CACHEABLE_STATUS_CODES:
            logger.debug(
                (
                    f"Considering the resource located at {get_safe_url(request.url)} "
                    "as cachable since its status code is heuristically cacheable."
                )
            )
            return True

        if not any(
            [
                response_cache_control.public,
                response_cache_control.private,
                expires_presents,
                response_cache_control.max_age is not None,
            ]
        ):
            logger.debug(
                (
                    f"Considering the resource located at {get_safe_url(request.url)} "
                    "as not cachable since it does not contain any of the required cache directives."
                )
            )
            return False

        logger.debug(
            (
                f"Considering the resource located at {get_safe_url(request.url)} "
                "as cachable since it meets the criteria for being stored in the cache."
            )
        )
        # the response is cachable!
        return True

    def _make_request_conditional(self, request: Request, response: Response) -> None:
        """
        Adds the precondition headers needed for response validation.

        This method will use the "Last-Modified" or "Etag" headers
        if they are provided in order to create precondition headers.
        "Last-Modified" becomes "If-Modified-Since" and "Etag" becomes
        "If-None-Match"; both are added when both are available.

        The request is modified in place: the new headers are appended to
        `request.headers`, and headers already present on the request are left as they are.

        See also (https://www.rfc-editor.org/rfc/rfc9111.html#name-sending-a-validation-reques)
        """

        if header_presents(response.headers, b"last-modified"):
            last_modified = extract_header_values(response.headers, b"last-modified", single=True)[0]
            logger.debug(
                (
                    f"Adding the 'If-Modified-Since' header with the value of '{last_modified.decode('ascii')}' "
                    f"to the request for the resource located at {get_safe_url(request.url)}."
                )
            )
        else:
            last_modified = None

        if header_presents(response.headers, b"etag"):
            etag = extract_header_values(response.headers, b"etag", single=True)[0]
            logger.debug(
                (
                    f"Adding the 'If-None-Match' header with the value of '{etag.decode('ascii')}' "
                    f"to the request for the resource located at {get_safe_url(request.url)}."
                )
            )
        else:
            etag = None

        precondition_headers: tp.List[tp.Tuple[bytes, bytes]] = []
        # Truthiness checks: an empty validator value is skipped just like a missing one.
        if last_modified:
            precondition_headers.append((b"If-Modified-Since", last_modified))
        if etag:
            precondition_headers.append((b"If-None-Match", etag))

        request.headers.extend(precondition_headers)

    def _validate_vary(self, request: Request, response: Response, original_request: Request) -> bool:
        """
        Determines whether the headers nominated by the stored response's "vary"
        header have identical values in the current request and in the original request.

        Returns False as soon as one nominated header differs, or when the
        "vary" header contains "*"; returns True otherwise (including when the
        response nominates no headers).

        See also (https://www.rfc-editor.org/rfc/rfc9111.html#name-calculating-cache-keys-with).
        """

        vary_headers = extract_header_values_decoded(response.headers, b"vary")
        vary = Vary.from_value(vary_values=vary_headers)
        for vary_header in vary._values:
            if vary_header == "*":
                return False  # pragma: no cover

            if extract_header_values(request.headers, vary_header) != extract_header_values(
                original_request.headers, vary_header
            ):
                return False

        return True

    def construct_response_from_cache(
        self, request: Request, response: Response, original_request: Request
    ) -> tp.Union[Response, Request, None]:
        """
        Specifies whether the response should be used, skipped, or validated by the cache.

        This method makes a decision regarding what to do with
        the stored response when it is retrieved from storage.
        It might be ready for use or it might need to be revalidated.
        This method mirrors the relevant section from RFC 9111,
        see (https://www.rfc-editor.org/rfc/rfc9111.html#name-constructing-responses-from).

        Args:
            request: The request currently being served.
            response: The stored response that is a candidate for reuse.
            original_request: The request that originally produced the stored
                response; used for the "vary" comparison.

        Returns:
            Response: This response is applicable to the request.
            Request: This response can be used for this request, but it must first be revalidated.
                The returned object is the `request` argument itself, with the
                precondition headers appended to it.
            None: It is not possible to use this response for this request.
        """

        # Use of responses with status codes 301 and 308 is always
        # legal as long as they don't adhere to any caching rules.
        if response.status in (301, 308):
            logger.debug(
                (
                    f"Considering the resource located at {get_safe_url(request.url)} "
                    "as valid for cache use since its status code is a permanent redirect."
                )
            )
            return response

        response_cache_control = parse_cache_control(extract_header_values_decoded(response.headers, b"Cache-Control"))
        request_cache_control = parse_cache_control(extract_header_values_decoded(request.headers, b"Cache-Control"))

        # request header fields nominated by the stored
        # response (if any) match those presented (see Section 4.1)
        if not self._validate_vary(request=request, response=response, original_request=original_request):
            # If the vary headers do not match, then do not use the response
            logger.debug(
                (
                    f"Considering the resource located at {get_safe_url(request.url)} "
                    "as invalid for cache use since the vary headers do not match."
                )
            )
            return None  # pragma: no cover

        # !!! this should be after the "vary" header validation.
        # The per-request extension, when set (even to False), overrides the controller default.
        force_cache = request.extensions.get("force_cache", None)
        if force_cache if force_cache is not None else self._force_cache:
            logger.debug(
                (
                    f"Considering the resource located at {get_safe_url(request.url)} "
                    "as valid for cache use since the request is forced to use the cache."
                )
            )
            return response

        # the stored response does not contain the
        # no-cache directive (Section 5.2.2.4), unless
        # it is successfully validated (Section 4.3)
        if (
            self._always_revalidate
            or response_cache_control.no_cache
            or response_cache_control.must_revalidate
            or request_cache_control.no_cache
        ):
            # The branches below only pick the log message; they follow the order of the condition above.
            if self._always_revalidate:
                log_text = (
                    f"Considering the resource located at {get_safe_url(request.url)} "
                    "as needing revalidation since the cache is set to always revalidate."
                )
            elif response_cache_control.no_cache:
                log_text = (
                    f"Considering the resource located at {get_safe_url(request.url)} "
                    "as needing revalidation since the response contains the no-cache directive."
                )
            elif response_cache_control.must_revalidate:
                log_text = (
                    f"Considering the resource located at {get_safe_url(request.url)} "
                    "as needing revalidation since the response contains the must-revalidate directive."
                )
            elif request_cache_control.no_cache:
                log_text = (
                    f"Considering the resource located at {get_safe_url(request.url)} "
                    "as needing revalidation since the request contains the no-cache directive."
                )
            else:
                assert False, "Unreachable code "  # pragma: no cover
            logger.debug(log_text)
            self._make_request_conditional(request=request, response=response)
            return request

        freshness_lifetime = get_freshness_lifetime(response)

        if freshness_lifetime is None:
            logger.debug(
                (
                    "Could not determine the freshness lifetime of "
                    f"the resource located at {get_safe_url(request.url)}, "
                    "trying to use heuristics to calculate it."
                )
            )
            if self._allow_heuristics and response.status in HEURISTICALLY_CACHEABLE_STATUS_CODES:
                freshness_lifetime = get_heuristic_freshness(response=response, clock=self._clock)
                logger.debug(
                    (
                        f"Successfully calculated the freshness lifetime of the resource located at "
                        f"{get_safe_url(request.url)} using heuristics."
                    )
                )
            else:
                logger.debug(
                    (
                        "Could not calculate the freshness lifetime of "
                        f"the resource located at {get_safe_url(request.url)}. "
                        "Making a conditional request to revalidate the response."
                    )
                )
                # If Freshness cannot be calculated, then send the request
                self._make_request_conditional(request=request, response=response)
                return request

        # `get_age` returns infinity when the response has no usable Date header,
        # in which case the response is never fresh.
        age = get_age(response, self._clock)
        is_fresh = freshness_lifetime > age

        # The min-fresh request directive indicates that the client
        # prefers a response whose freshness lifetime is no less than
        # its current age plus the specified time in seconds.
        # That is, the client wants a response that will still
        # be fresh for at least the specified number of seconds.
        if request_cache_control.min_fresh is not None:
            if freshness_lifetime < (age + request_cache_control.min_fresh):
                logger.debug(
                    (
                        f"Considering the resource located at {get_safe_url(request.url)} "
                        "as invalid for cache use since the time left for "
                        "freshness is less than the min-fresh directive."
                    )
                )
                return None

        # The max-stale request directive indicates that the
        # client will accept a response that has exceeded its freshness lifetime.
        # If a value is present, then the client is willing to accept a response
        # that has exceeded its freshness lifetime by no more than the specified
        # number of seconds. If no value is assigned to max-stale, then
        # the client will accept a stale response of any age.
        if not is_fresh and request_cache_control.max_stale is not None:
            exceeded_freshness_lifetime = age - freshness_lifetime

            if request_cache_control.max_stale < exceeded_freshness_lifetime:
                logger.debug(
                    (
                        f"Considering the resource located at {get_safe_url(request.url)} "
                        "as invalid for cache use since the freshness lifetime has been exceeded more than max-stale."
                    )
                )
                return None
            else:
                logger.debug(
                    (
                        f"Considering the resource located at {get_safe_url(request.url)} "
                        "as valid for cache use since the freshness lifetime has been exceeded less than max-stale."
                    )
                )
                return response

        # The max-age request directive indicates that
        # the client prefers a response whose age is
        # less than or equal to the specified number of seconds.
        # Unless the max-stale request directive is also present,
        # the client does not wish to receive a stale response.
        if request_cache_control.max_age is not None:
            if request_cache_control.max_age < age:
                logger.debug(
                    (
                        f"Considering the resource located at {get_safe_url(request.url)} "
                        "as invalid for cache use since the age of the response exceeds the max-age directive."
                    )
                )
                return None

        # the stored response is one of the following:
        # fresh (see Section 4.2), or
        # allowed to be served stale (see Section 4.2.4), or
        # successfully validated (see Section 4.3).
        if is_fresh:
            logger.debug(
                (
                    f"Considering the resource located at {get_safe_url(request.url)} "
                    "as valid for cache use since it is fresh."
                )
            )
            return response
        else:
            logger.debug(
                (
                    f"Considering the resource located at {get_safe_url(request.url)} "
                    "as needing revalidation since it is not fresh."
                )
            )
            # Otherwise, make a conditional request
            self._make_request_conditional(request=request, response=response)
            return request

    def handle_validation_response(self, old_response: Response, new_response: Response) -> Response:
        """
        Handles incoming validation response.

        This method takes care of what to do with the incoming
        validation response; if it is a 304 response, it updates
        the headers of the stored response with those of the new
        response and returns the stored response. Any other status
        code means the new response is returned as is.

        Note that for a 304 the `old_response` object itself is modified:
        its `headers` attribute is replaced with the merged headers.

        This method mirrors the relevant section from RFC 9111,
        see (https://www.rfc-editor.org/rfc/rfc9111.html#name-handling-a-validation-respo).
        """
        if new_response.status == 304:
            headers = get_updated_headers(
                stored_response_headers=old_response.headers,
                new_response_headers=new_response.headers,
            )
            old_response.headers = headers
            return old_response
        else:
            return new_response
|