100) die("Invalid maxReturned value, must be between 1 and 100 !");
if(!is_dir(__DIR__ . "/archive/$blog")) mkdir(__DIR__ . "/archive/$blog");
// First execution of the function, starting at the current date
if(!file_exists(__DIR__ . "/archive/$blog/autoOldest")) {
// The PHP timestamp is in seconds while the Kinja one is in milliseconds, so it is multiplied by 1000 (three more zeroes)
$oldest = time()*1000;
file_put_contents(__DIR__ . "/archive/$blog/autoNewest", $oldest);
} else {
// Getting the date of the last fetched post
$oldest = (int) trim(file_get_contents(__DIR__ . "/archive/$blog/autoOldest"));
if($oldest == -1) die("All posts older than ".trim(file_get_contents(__DIR__ . "/archive/$blog/autoNewest"))." have already been retrieved for $blog !");
if($oldest < 1000000000000) die("Invalid timestamps on ". __DIR__ . "/archive/$blog/autoOldest ($oldest) !");
}
if(!is_int($oldest)) die("Invalid autoOldest data ($oldest) for $blog !");
// The URL is not the same between blogs and users posts (if there is a "/" after the login for users, it does not work)
if(!$user) $url = "https://${blog}.kinja.com/?startTime=${oldest}&maxReturned=${maxReturned}";
// TODO: startTime does not work for users...
else $url = "https://kinja.com/${blog}?startTime=${oldest}&maxReturned=${maxReturned}";
// The blogs mainpages can start with posts after a certain timestamp by passing the GET parameter "startTime"
if($newOldest = fetchArticlesFromPage($url, max($count, 1))) {
if(!is_numeric($newOldest)) die("Invalid newOldest timestamp($newOldest) for $blog !");
if($newOldest > $oldest) die("The new oldest timestamp is newer than the previous !");
// $count articles have been archived (or all the available ones have already been archived), saving the timestamp to start at for the next execution of the script
file_put_contents(__DIR__ . "/archive/$blog/autoOldest", $newOldest);
} else die("Error calling fetchArticlesFromPage() !");
}
/**
 * Archives the articles listed on a Kinja listing page (blog main page, tag page,
 * search page or user posts page).
 *
 * For each <article> node of the page, the JSON held in the "data-state" attribute
 * of its js_editor-tools div is decoded to obtain the post permalink, a lock file
 * named after the post id is written in the current working directory, and
 * fetchArticle() is called on the permalink. Articles whose permalink is not on a
 * *.kinja.com blog are reported and skipped.
 *
 * Every unrecoverable problem ends the script through die(), which is the error
 * handling convention used throughout this file.
 *
 * @param string $url   URL of the listing page; it must match the Kinja URL pattern below.
 * @param int    $count Maximum number of articles to archive; 0 means "unlimited"
 *                      (bounded only by what the page lists).
 *
 * @return int|string|null When $count > 0: the millisecond timestamp to resume from
 *                         (timestamp of the article at which $count was reached, the
 *                         startTime of the "next page" link, or the timestamp of the
 *                         last listed article minus one second), or -1 when the page
 *                         has no "next page" link. Null when $count is 0.
 */
function fetchArticlesFromPage($url, $count = 0) {
    // Checking that the URL that has been passed is a valid Kinja "mainpage" (or user page) URL.
    // FILTER_VALIDATE_URL is used without flags: the former third argument was a plain list, which
    // filter_var() ignores (flags must be an int or sit under the "flags" key), and it referenced
    // FILTER_FLAG_HOST_REQUIRED, which no longer exists in PHP 8.
    // The character classes use a-z (the pattern is case-insensitive) instead of A-z, which also
    // matched the punctuation located between "Z" and "a" in ASCII; "_" is kept explicitly.
    $listingPattern = '#^https://([a-z0-9_]{2,50}\.kinja\.com(/?|/tag/[a-z0-9_-]+|/search/?\?[a-z0-9_&=%-]+$)|kinja\.com/([a-z0-9_-]+))(/?\?.+)?$#i';
    if (filter_var($url, FILTER_VALIDATE_URL) === false || !preg_match($listingPattern, $url)) {
        die("Invalid mainpage URL ($url) !");
    }

    // Using DOMDocument so that every article of the listing can be located with XPath
    $mainpage = new DOMDocument;
    // To avoid warnings for non standard tags on the HTML code
    $mainpage->strictErrorChecking = false;
    $mainpage->recover = true;
    // Fetching the content from the page on Kinja (warnings are deliberately silenced, a failure is reported below)
    if (!@$mainpage->loadHTMLFile($url)) die("Could not retrieve the page content");

    $xpath = new DOMXPath($mainpage);
    $articles = $xpath->query("/html/body//article");
    if ($articles === false || $articles->length == 0) die("No article found on $url !");

    // Returns the publication date of an article node as a UNIX timestamp in seconds
    $timestampOf = function ($node) use ($xpath) {
        $time = $xpath->query(".//time", $node)->item(0);
        $seconds = ($time === null) ? false : strtotime($time->getAttribute("datetime"));
        if ($seconds === false) die("Could not determine the publication date of an article !");
        return $seconds;
    };

    $archived = 0;
    $article = null;
    // Loop on each article of the page
    foreach ($articles as $article) {
        $postId = $article->getAttribute("data-id");
        if (empty($postId)) die("Could not determine the post Id !");

        // Extracting the JSON on the div js_editor-tools that contains the URL to the post (postPermalink)
        $toolsDiv = $xpath->query(".//div[@class='js_editor-tools sc-1i9kufk-0 ixGbux']", $article)->item(0);
        if ($toolsDiv === null) die("Could not find the js_editor-tools div of the post $postId !");
        $postToolsWrapperJSON = $toolsDiv->getAttribute("data-state");
        $postToolsWrapper = json_decode($postToolsWrapperJSON);
        if (!is_object($postToolsWrapper) || !isset($postToolsWrapper->postPermalink) || !is_string($postToolsWrapper->postPermalink)) {
            die("Could not read the postPermalink of the post $postId !");
        }
        $postPermalink = $postToolsWrapper->postPermalink;

        if (!preg_match("#^https://([^\.]+)\.kinja\.com/#", $postPermalink, $matches)) {
            // This is a link "shared" from one of the main websites such as Jalopnik, no need to save this one
            echo "\nCould not determine the blog from the Permalink ($postPermalink) on $url, this article is not backuped !\n\n";
            continue;
        }
        $blog = $matches[1];

        // This article can and will be archived: a lock file is created to avoid having two instances of the
        // script fetching the same article at the same time. It is not removed here, so it stays behind when
        // the operation fails, which makes problematic articles easier to find.
        file_put_contents("fetchArticle.$postId.lock", getmypid());
        if (fetchArticle($postPermalink)) $archived++;

        // NOTE(review): $archivePath was never defined in this function, so the JSON used to be written to
        // "js_editor-tools.txt" in the current working directory. The per-article directory below is an
        // assumption based on the "archive/<blog>" layout used elsewhere in this file; confirm it against
        // the directory that fetchArticle() creates.
        $archivePath = __DIR__ . "/archive/$blog/$postId/";
        if (is_dir($archivePath) && !file_exists($archivePath . "js_editor-tools.txt")) {
            file_put_contents($archivePath . "js_editor-tools.txt", $postToolsWrapperJSON);
        }

        if ($count > 0 && $archived >= $count) {
            // $count articles have been archived, returning the timestamp (in milliseconds) of the current article
            return $timestampOf($article) * 1000;
        }
    }

    // Without a limit there is no position to resume from
    if ($count <= 0) return null;

    // The end of the page has been reached before archiving $count articles, looking for the "Next Page" button
    $nextPageLink = $xpath->query(".//div[@class='sc-1uzyw0z-0 kNHeFZ']//a[@class='sc-1out364-0 hMndXN js_link']")->item(0);
    // No link to a next page means this is the last one: -1 tells the caller that all "old" posts have been fetched
    if ($nextPageLink === null) return -1;

    $nextPageHref = $nextPageLink->getAttribute("href");
    if (!$nextPageHref) die("Could not determine the timestamp for the next page !");

    // Mainpages link to "?startTime=<timestamp>": the prefix (11 characters) is removed to keep the timestamp.
    // The previous test compared the END of the href to the prefix and therefore never matched.
    if (strncmp($nextPageHref, "?startTime=", 11) === 0) {
        return substr($nextPageHref, 11);
    }

    // User posts list pages have ?startIndex= on their next page links, so the timestamp of the last
    // article listed on the page is used instead, decremented by one second
    return ($timestampOf($article) - 1) * 1000;
}
// Fetches the article, extracts the useful information to keep only the article content itself, downloads a local copy of the media (videos and images) and makes the article "static" (no JS, CSS or media fetched from Kinja); also fetches the thumbnail (used on main pages and user posts pages)
function fetchArticle($url) {
// Checking that the post URL that has been passed as the "article" GET variable is a valid Kinja URL
if(is_numeric($url)) $url = "https://oppositelock.kinja.com/$url";
if(!filter_var($url, FILTER_VALIDATE_URL, array(FILTER_FLAG_PATH_REQUIRED, FILTER_FLAG_HOST_REQUIRED)) || !preg_match('#^https://([A-z0-9-]{2,50})\.kinja\.com/([A-z0-9-_]+)$#i', $url, $article)) {
die("Invalid article URL ($url) !");
}
// $blog = $article[1];
$articleURI = $article[2];
// Using DOMDocument to extract only the post content of the page (not the header, footer, "You may also like", Comments section and list of other posts ; also used to fix src tags
$article = new DOMDocument;
// To avoid warnings for non standard tags on the HTML code
$article->strictErrorChecking = false;
$article->recover = true;
// Fetching the content from the page on Kinja
if(!@$article->loadHTMLFile($url)) die("Could not retrieve the page content");
//$articleHead = $dom->getElementsByTagName("head")->item(0);
// Getting the publish-date of the page
$xpath = new DOMXPath($article);
$articleHead = $xpath->query("/html/head")->item(0);
// The date the article has been published
// $publishDate = $xpath->query("./meta[@name='publish-date']", $articleHead)->item(0)->getAttribute("content") or die("Could not retrieve the publish-date");
// Converting the date string
// $publishDate = strtotime($publishDate);
// Getting the title of the page (<title> tag)
$pageTitle = $xpath->query("./title", $articleHead)->item(0)->nodeValue;
// Getting the title of the post (<header> tag)
if($postTitle = $xpath->query("/html/body//header")->item(0)) {
// Some articles dont have a title and thus don't have the tag
if(!$postTitle = $postTitle->C14N()) die("Could not find the with the title !");
}
// The $url can sometimes redirect to another blog, we fetch the "right" URL inside the "canonical" to extract the right blog name of this article
$postPermalink = $xpath->query("./link[@rel='canonical']", $articleHead)->item(0)->getAttribute("href") or die("Could not retrieve the canonical URL of this article ($url) !");
if(preg_match("#^https://([^\.]+)\.kinja\.com/#", $postPermalink, $blog)) $blog = $blog[1];
else die("
Could not determine the blog from the Permalink ($postPermalink) on $url, this article is not backuped !
\n");
// The interesting content is in the "js_starterpost"
if(!$starterPost = $xpath->query("/html/body//div[@class='js_starterpost']")->item(0)) die("Could not find the 'js_starterpost' div !");
// Extracting the JSON on the div post-tools-wrapper that contains the postId (the unique numeric identfier of each posts)
$toolsDiv = $xpath->query(".//div[@class='sc-83o472-2 hNtXBx']", $starterpost)->item(0);
$postToolsWrapper = json_decode($xpath->query(".//div[@class='post-tools-wrapper']", $toolsDiv)->item(0)->getAttribute("data-state"));
$postId = $postToolsWrapper->postId;
if(empty($postId)) die("Could not determine the post Id !");
// Removal of the "js_share-tools"