<?php
// Extracts the content part of a Kinja page with "static" CSS, fetches the important information into variables, then archives the files locally
// The full article URL should be passed as the "article" GET argument

// BETA state: at the moment, batch extraction only works for the articles listed on a single page (such as the mainpage of a blog or a user posts page)

die("404 on Kinja made this script useless.");

// Default timezone used by the PHP date/time functions (strtotime() is used on the articles dates)
date_default_timezone_set("America/New_York");

// ---- Parameters passed by GET ----

// "update" : any non-empty value makes fetchArticlesFromPage() process again the articles that are already archived
define("forceUpdate", !empty($_GET["update"]));

// "count" : number of articles to process, 0 when missing or not numeric
$count = (isset($_GET["count"]) && is_numeric($_GET["count"])) ? (int) $_GET["count"] : 0;

// "maxReturned" : forwarded to autoOldest(), 20 when missing or not numeric
$maxReturned = (isset($_GET["maxReturned"]) && is_numeric($_GET["maxReturned"])) ? (int) $_GET["maxReturned"] : 20;

// ---- Dispatch : the first of "article", "page" or "blog" that is present decides what is done ----
if(isset($_GET["article"])) {
	fetchArticle($_GET["article"]);
} elseif(isset($_GET["page"])) {
	fetchArticlesFromPage($_GET["page"], $count);
} elseif(isset($_GET["blog"])) {
	autoOldest($_GET["blog"], $count, False, $maxReturned);
}
//elseif(isset($_GET["user"]))	autoOldest($_GET["user"], $count, True);

/**
 * Automatically fetches posts older than the last ones already fetched for this blog/user.
 *
 * The progress is kept in two files inside archive/$blog/ :
 *  - autoNewest : the timestamp (in milliseconds) written on the first execution
 *  - autoOldest : the timestamp (in milliseconds) the next execution starts from, or -1 once
 *                 fetchArticlesFromPage() reported that there is no next page
 *
 * @param string $blog        Blog name (or user login when $user is True) : 2 to 50 letters, digits, "-" or "_"
 * @param int    $count       Number of articles to archive ; at least 1 is always requested
 * @param bool   $user        True to build the user posts page URL instead of the blog mainpage URL
 * @param int    $maxReturned Value of the "maxReturned" parameter sent to Kinja, between 1 and 100
 * @return void Every error stops the script with die()
 */
function autoOldest($blog, $count = 1, $user = False, $maxReturned = 20) {
	// The name ends up in a folder name and in a hostname : the class is spelled out because "A-z" also matches [ \ ] ^ and `,
	// and the D modifier prevents "$" from accepting a trailing newline
	if(!is_string($blog) || !preg_match('/^[A-Za-z0-9_-]{2,50}$/D', $blog)) {
		// The value comes straight from the request, it is escaped before being sent back to the browser
		die("Invalid blog name (" . htmlspecialchars(is_scalar($blog) ? (string) $blog : gettype($blog), ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8") . ") !");
	}
	if(!is_int($maxReturned) || $maxReturned < 1 || $maxReturned > 100) die("Invalid maxReturned value, must be between 1 and 100 !");

	$blogDir = __DIR__ . "/archive/" . $blog;
	$oldestFile = $blogDir . "/autoOldest";
	$newestFile = $blogDir . "/autoNewest";

	// Recursive so that the first run also creates the "archive" folder itself
	if(!is_dir($blogDir) && !mkdir($blogDir, 0777, True) && !is_dir($blogDir)) die("Could not create the archive folder for $blog !");

	if(!file_exists($oldestFile)) {
		// First execution of the function, starting at the current date
		// The PHP timestamp is in seconds while the Kinja one is in milliseconds, adding 3 zeroes by multiplying
		$oldest = time() * 1000;
		if(file_put_contents($newestFile, $oldest) === false) die("Could not save the autoNewest timestamp for $blog !");
	} else {
		// Getting the timestamp saved by the previous execution
		$oldest = (int) trim(file_get_contents($oldestFile));
		if($oldest == -1) die("All posts older than ".trim(file_get_contents($newestFile))." have already been retrieved for $blog !");
		if($oldest < 1000000000000) die("Invalid timestamps on ". $oldestFile . " ($oldest) !");
	}

	if(!is_int($oldest)) die("Invalid autoOldest data ($oldest) for $blog !");

	// The URL is not the same between blogs and users posts (if there is a "/" after the login for users, it does not work)
	// The blogs mainpages can start with posts after a certain timestamp by passing the GET parameter "startTime"
	if(!$user)	$url = "https://{$blog}.kinja.com/?startTime={$oldest}&maxReturned={$maxReturned}";
// TODO: startTime does not work for users...
	else		$url = "https://kinja.com/{$blog}?startTime={$oldest}&maxReturned={$maxReturned}";

	$newOldest = fetchArticlesFromPage($url, max($count, 1));
	if(!$newOldest) die("Error calling fetchArticlesFromPage() !");

	// The value can come from a link of the fetched page, it is escaped before being displayed
	if(!is_numeric($newOldest)) die("Invalid newOldest timestamp(" . htmlspecialchars((string) $newOldest, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8") . ") for $blog !");
	if($newOldest > $oldest) die("The new oldest timestamp is newer than the previous !");

	// $count articles have been archived (or all the available ones have already been archived), saving the timestamp to start at for the next execution of the script
	if(file_put_contents($oldestFile, $newOldest) === false) die("Could not save the autoOldest timestamp for $blog !");
}

/**
 * Fetches the articles listed on a page such as a blog mainpage or a user posts page.
 *
 * For each <article> of the page that is not archived yet (or for all of them when the forceUpdate constant is true),
 * fetchArticle() is called on its permalink ; the JSON found on the "data-state" attribute of its js_editor-tools div
 * is saved as js_editor-tools.txt in the article archive folder.
 *
 * @param string $url   URL of a Kinja blog mainpage, tag page, search page or user posts page
 * @param int    $count Maximum number of articles to fetch ; 0 being "unlimited" (limited by what is listed on the requested page)
 * @return int|string|null When $count > 0 : the timestamp (in milliseconds) to start from on the next call, or -1 when the page
 *                         has no link to a next page. Nothing (null) is returned when $count is 0.
 */
function fetchArticlesFromPage($url, $count = 0) {
	// Escapes a value before it is written in the HTML output (URLs and page content are not trusted)
	$esc = function($value) {
		return htmlspecialchars(is_scalar($value) ? (string) $value : gettype($value), ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8");
	};

	// Timestamp (in seconds) of the <time> element of an article
	$articleTimestamp = function($node) use ($xpath_placeholder) { return 0; };
	unset($articleTimestamp);

	// Checking that the URL that has been passed is a valid Kinja "mainpage" (or user page) URL
	// No flag is passed to filter_var() : the original list of flags was ignored (not under a "flags" key) and FILTER_FLAG_HOST_REQUIRED does not exist anymore in PHP 8
	// Character classes are spelled out ("A-z" also matches [ \ ] ^ and `) and "-" is accepted in the blog name as it is in fetchArticle()
	if(!is_string($url) || !filter_var($url, FILTER_VALIDATE_URL) || !preg_match('#^https://([a-z0-9-]{2,50}\.kinja\.com(/?|/tag/[a-z0-9_-]+|/search/?\?[a-z0-9_&=%-]+$)|kinja\.com/([a-z0-9_-]+))(/?\?.+)?$#i', $url)) {
		die("Invalid mainpage URL (" . $esc($url) . ") !");
	}

	// Using DOMDocument to read the list of articles of the page
	$mainpage = new DOMDocument;

	// To avoid warnings for non standard tags on the HTML code
	$mainpage->strictErrorChecking = false;
	$mainpage->recover = true;

	// Fetching the content from the page on Kinja
	if(!@$mainpage->loadHTMLFile($url)) die("Could not retrieve the page content");
	$xpath = new DOMXPath($mainpage);

	// Timestamp (in seconds) read on the "datetime" attribute of the first <time> of an article
	$articleTimestamp = function($node) use ($xpath) {
		$time = $xpath->query(".//time", $node)->item(0);
		$timestamp = $time ? strtotime($time->getAttribute("datetime")) : false;
		if($timestamp === false) die("Could not determine the date of an article !");
		return $timestamp;
	};

	$articles = $xpath->query("/html/body//article");

	if(!$articles || $articles->length == 0) die("No article found on " . $esc($url) . " !");

	$i = 0;
	$article = null;
	// Loop on each <article> on the page
	foreach($articles as $article) {
		$postId = $article->getAttribute("data-id");
		if(empty($postId)) die("Could not determine the post Id !");
		// The post Id is used to build the archive and lockfile paths, only digits are accepted
		if(!preg_match('/^[0-9]+$/D', $postId)) die("Invalid post Id (" . $esc($postId) . ") !");

		// Extracting the JSON on the div js_editor-tools that contains the URL to the post (postPermalink)
		$postToolsNode = $xpath->query(".//div[@class='js_editor-tools sc-1i9kufk-0 ixGbux']", $article)->item(0);
		if(!$postToolsNode) die("Could not find the js_editor-tools div of the article $postId !");
		$postToolsWrapperJSON = $postToolsNode->getAttribute("data-state");
		$postToolsWrapper = json_decode($postToolsWrapperJSON);
		if(!is_object($postToolsWrapper) || !isset($postToolsWrapper->postPermalink) || !is_string($postToolsWrapper->postPermalink)) {
			die("Could not determine the permalink of the article $postId !");
		}
		$postPermalink = $postToolsWrapper->postPermalink;

		// The blog name is used as a folder name, only hostname characters are accepted
		if(preg_match('#^https://([A-Za-z0-9-]+)\.kinja\.com/#', $postPermalink, $blogMatch)) $blog = $blogMatch[1];
		else {
			// This is a link "shared" from one of the main websites such as Jalopnik, no need to save this one
			echo("<p>Could not determine the blog from the Permalink (" . $esc($postPermalink) . ") on " . $esc($url) . ", this article is not backuped !</p>\n");
			continue;
		}

		// The author login is the end of the URL of the link to the author page
		$postAuthor = $xpath->query(".//div[@class='sc-1mep9y1-0 sc-1ixdk2y-0 fjyRLU sc-1rye1-3 hwANEt']/a[@class='sc-1out364-0 hMndXN js_link']", $article)->item(0);
		if(!$postAuthor) die("Could not determine the article $postId author !");
		$postAuthorLogin = preg_replace("#^https://kinja\.com/#", "", $postAuthor->getAttribute("href"));
		if(!$postAuthorLogin) die("Could not determine the article $postId author !");
		// Same check as in fetchArticle() : the login is used as a folder name
		if(!preg_match('/^[A-Za-z0-9_-]{2,50}$/D', $postAuthorLogin)) die("Invalid author login (" . $esc($postAuthorLogin) . ") !");

		// Checking if the local archive for this article already exists
		$relativeArchivePath = "/archive/" . $blog . "/" . $postAuthorLogin . "/" . $postId . "/";
		$archivePath = __DIR__ . $relativeArchivePath;
		$lockFile = "fetchArticle.$postId.lock";

		if(!forceUpdate && file_exists($archivePath . "archived")) {
			echo "<p>Article already archived as <a href='.{$relativeArchivePath}article.html'>{$relativeArchivePath}article.html</a> !</p>\n";
		} elseif(file_exists($lockFile)) {
			echo "<p>Lock file on <a href='.{$relativeArchivePath}article.html'>{$relativeArchivePath}article.html</a> !</p>\n";
			continue;
		} else {
			// This article can and will be archived, creating a lockfile to avoid having two instances of the script fetching the same article at the same time ; the lock file is kept if the operation failed, making it easier to retrieve problematic articles
			file_put_contents($lockFile, getmypid());
			if(fetchArticle($postPermalink)) $i++;
		}

		// fetchArticle() builds its own archive path from the canonical URL of the article : nothing is written if the folder expected here does not exist
		if(is_dir($archivePath) && !file_exists($archivePath . "js_editor-tools.txt")) file_put_contents($archivePath . "js_editor-tools.txt", $postToolsWrapperJSON);
/*		if(!file_exists($archivePath . "datetime.txt")) file_put_contents($archivePath . "datetime.txt", $xpath->query(".//time", $article)->item(0)->getAttribute("datetime"));
		if(!file_exists($archivePath . "thumbnail")) {
			$thumbnailId = $xpath->query(".//div[@class='sc-1xh12qx-2 gmDVKF js_lazy-image']//img", $article)->item(0)->getAttribute("data-chomp-id");
			file_put_contents($archivePath . "thumbnail", file_get_contents("https://i.kinja-img.com/gawker-media/image/upload/c_fill,f_auto,g_center,h_149,pg_1,q_60,w_265/".$thumbnailId.".jpg")) or die("Could not retrieve the thumbnail for the article $postId !");
		}*/

		if($count > 0 && $i >= $count) {
			// $count articles have been archived, returning the UNIX Timestamp (with 3 more zeroes, Kinja uses milliseconds) of the current article
			return $articleTimestamp($article) * 1000;
		}
	}

	if($count > 0) {
		// Less than $count articles of this page had to be archived, returning the timestamp of the "Next Page" button
		$nextPageLink = $xpath->query(".//div[@class='sc-1uzyw0z-0 kNHeFZ']//a[@class='sc-1out364-0 hMndXN js_link']")->item(0);
		// If there is no link to a next page... then we are already on the last one, returning -1 means that all "old" posts have been fetched for this "blog"
		if(empty($nextPageLink)) return -1;
		$nextPageHref = $nextPageLink->getAttribute("href");
		if(!$nextPageHref) die("Could not determine the timestamp for the next page !");

		// Comparing the first 11 characters of the href (the original compared everything AFTER them, which never matched)
		if(substr($nextPageHref, 0, 11) == "?startTime=") {
			// Removing "?startTime=" from the href... if it's a mainpage
			return substr($nextPageHref, 11);
		} else {
			// User posts list pages have ?startIndex= on their next page links so we have to determine the timestamp of the last article listed on the page and decrement it by 1 second
			return ($articleTimestamp($article) - 1) * 1000;
		}
	}
}

/**
 * Fetches an article and archives it locally as a "static" page.
 *
 * Only the article itself is kept (title, header and content of the js_starterpost div) ; the medias (images, videos,
 * avatar, thumbnail, fonts and images of the page CSS) are downloaded, the Kinja iframes are replaced by direct
 * embeddings and the result is written as article.html + style-inline.css in archive/<blog>/<author login>/<post id>/.
 * The comments JSON is saved as articleMetadatas.json and an "archived" file marks the article as done.
 *
 * @param string|int $url Full URL of the article (https://<blog>.kinja.com/<article>) ; a numeric value is
 *                        turned into a https://oppositelock.kinja.com/<value> URL
 * @return bool True once the article has been archived ; every blocking error stops the script with die()
 */
function fetchArticle($url) {
	// Escapes a value before it is written in the HTML output (URLs and page content are not trusted)
	$esc = function($value) {
		return htmlspecialchars(is_scalar($value) ? (string) $value : gettype($value), ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8");
	};

	// Downloads $source to $destination unless a non-empty copy is already there ; returns False on failure.
	// Nothing is written when the download failed : an empty file would make the next executions believe the media is already archived.
	$download = function($source, $destination) {
		if(is_file($destination) && filesize($destination) > 0) return True;
		$directory = dirname($destination);
		if(!is_dir($directory) && !mkdir($directory, 0750, True) && !is_dir($directory)) return False;
		$data = file_get_contents($source);
		if($data === false || $data === "") return False;
		return file_put_contents($destination, $data) !== false;
	};

	// Checking that the post URL that has been passed as the "article" GET variable is a valid Kinja URL
	if(is_numeric($url)) $url = "https://oppositelock.kinja.com/$url";
	// No flag is passed to filter_var() : the original list of flags was ignored (not under a "flags" key) and FILTER_FLAG_HOST_REQUIRED does not exist anymore in PHP 8
	// Character classes are spelled out : "A-z" also matches [ \ ] ^ and `
	if(!is_string($url) || !filter_var($url, FILTER_VALIDATE_URL) || !preg_match('#^https://([a-z0-9-]{2,50})\.kinja\.com/([a-z0-9_-]+)$#iD', $url)) {
		die("Invalid article URL (" . $esc($url) . ") !");
	}

	// Using DOMDocument to extract only the post content of the page (not the header, footer, "You may also like", Comments section and list of other posts) ; also used to fix <img> src tags
	$article = new DOMDocument;

	// To avoid warnings for non standard tags on the HTML code
	$article->strictErrorChecking = false;
	$article->recover = true;

	// Fetching the content from the page on Kinja
	if(!@$article->loadHTMLFile($url)) die("Could not retrieve the page content");

	$xpath = new DOMXPath($article);

	$articleHead = $xpath->query("/html/head")->item(0);
	if(!$articleHead) die("Could not find the head of the page !");

	// The date the article has been published
//	$publishDate = $xpath->query("./meta[@name='publish-date']", $articleHead)->item(0)->getAttribute("content") or die("Could not retrieve the publish-date");
	// Converting the date string
//	$publishDate = strtotime($publishDate);

	// Getting the title of the page (<title> tag) ; empty when the page has none
	$pageTitleNode = $xpath->query("./title", $articleHead)->item(0);
	$pageTitle = $pageTitleNode ? $pageTitleNode->nodeValue : "";

	// Getting the title of the post (<header> tag) ; some articles don't have a title and thus don't have the <header> tag
	$postTitle = "";
	if($postTitleNode = $xpath->query("/html/body//header")->item(0)) {
		$postTitle = $postTitleNode->C14N();
		if(!$postTitle) die("Could not find the <header> with the title !");
	}

	// The $url can sometimes redirect to another blog, we fetch the "right" URL inside the "canonical" <link> to extract the right blog name of this article
	$canonicalLink = $xpath->query("./link[@rel='canonical']", $articleHead)->item(0);
	$postPermalink = $canonicalLink ? $canonicalLink->getAttribute("href") : "";
	if(!$postPermalink) die("Could not retrieve the canonical URL of this article (" . $esc($url) . ") !");

	// The blog name is used as a folder name, only hostname characters are accepted
	if(preg_match('#^https://([A-Za-z0-9-]+)\.kinja\.com/#', $postPermalink, $blogMatch)) $blog = $blogMatch[1];
	else die("<p>Could not determine the blog from the Permalink (" . $esc($postPermalink) . ") on " . $esc($url) . ", this article is not backuped !</p>\n");

	// The interesting content is in the "js_starterpost" <div>
	if(!$starterPost = $xpath->query("/html/body//div[@class='js_starterpost']")->item(0)) die("Could not find the 'js_starterpost' div !");

	// Extracting the JSON on the div post-tools-wrapper that contains the postId (the unique numeric identifier of each post)
	// NOTE(review): the original passed the undefined $starterpost (lowercase p) as context, which made XPath search the whole document ; confirm that this div is inside js_starterpost
	$toolsDiv = $xpath->query(".//div[@class='sc-83o472-2 hNtXBx']", $starterPost)->item(0);
	if(!$toolsDiv) die("Could not find the tools div of the article !");
	$postToolsNode = $xpath->query(".//div[@class='post-tools-wrapper']", $toolsDiv)->item(0);
	$postToolsWrapper = $postToolsNode ? json_decode($postToolsNode->getAttribute("data-state")) : null;
	$postId = (is_object($postToolsWrapper) && isset($postToolsWrapper->postId) && is_scalar($postToolsWrapper->postId)) ? (string) $postToolsWrapper->postId : "";
	if(empty($postId)) die("Could not determine the post Id !");
	// The post Id is used to build the archive and lockfile paths, only digits are accepted
	if(!preg_match('/^[0-9]+$/D', $postId)) die("Invalid post Id (" . $esc($postId) . ") !");

	// Removal of the "js_share-tools" <div> that contains the "Share to Facebook/Twitter/Email/Link" buttons that are of no use
	$jsShareTools = $xpath->query("./div[@class='js_share-tools']", $toolsDiv)->item(0);
	if($jsShareTools) $jsShareTools->parentNode->removeChild($jsShareTools);

	// Retrieving the author name and his Kinja login ; it is not in the first "<a class='sc-1out364-0 hMndXN js_link'>" of the js_starterpost but in the second...
	if(!$postAuthor = $xpath->query(".//div[@class='sc-1jc3ukb-2 fUsAEy']//a[@class='sc-1out364-0 hMndXN js_link']", $starterPost)->item(0)) die("Could not find the 'js_link' div !");
	$postAuthorName = $postAuthor->nodeValue;
	$postAuthorLogin = preg_replace("|^https://kinja\.com/|", "", $postAuthor->getAttribute("href"));
	if(empty($postAuthorName) || empty($postAuthorLogin)) die("Could not determine the author name or login !");
	// The login is used as a folder name, as a file name and inside XPath expressions
	if(!preg_match('/^[A-Za-z0-9_-]{2,50}$/D', $postAuthorLogin)) die("Invalid author login (" . $esc($postAuthorLogin) . ") !");

	// Location of the local archive of this article
	//$archivePath = date("y/n/j", $publishDate);
	$relativeArchivePath = "/archive/" . $blog . "/" . $postAuthorLogin . "/" . $postId . "/";
	$archivePath = __DIR__ . $relativeArchivePath;
	$lockFile = "fetchArticle.$postId.lock";

	// If the article has not been archived yet, it needs the data folder to be created
	if(!is_dir($archivePath) && !mkdir($archivePath, 0750, True)) die("Could not create the archive folder for the article $postId !");

	// When the "archived" file exists the page has already been archived, the redirection (301) to it is disabled for now
//// TODO: Remove comment for production use
////	if(file_exists($archivePath . "archived")) {
////		header("Location: ." . $relativeArchivePath . "article.html");
////		die("Should have redirected to the already generated archive page <a href='.{$relativeArchivePath}article.html'>{$relativeArchivePath}article.html</a> !");
////	}

//	if(!file_exists($archivePath . "datetime.txt")) file_put_contents($archivePath . "datetime.txt", $xpath->query("./meta[@name='publish-date']", $articleHead)->item(0)->getAttribute("content")) or die("Could not retrieve the publish-date");

	// Saving the thumbnail of the article in the resolution used by the mainpage/author posts page
	if(!file_exists($archivePath . "thumbnail")) {
		// The post thumbnail is on the content attribute of the header <meta property="og:image"> but not in the small resolution used by the main pages, fetching the version that is 265px wide instead (same filename with only the previous part of the URL changing)
		$ogImage = $xpath->query("./meta[@property='og:image']", $articleHead)->item(0);
		$thumbnailURL = $ogImage ? $ogImage->getAttribute("content") : "";
		// Articles without a thumbnail have their og:image pointing to a placeholder image (or have no og:image), no need to fetch it
		if($thumbnailURL != "" && $thumbnailURL != "https://x.kinja-static.com/assets/images/logos/placeholders/default.png") {
			$smallThumbnailURL = "https://i.kinja-img.com/gawker-media/image/upload/c_fill,f_auto,g_center,h_149,pg_1,q_60,w_265/".basename($thumbnailURL);
			if(!$download($smallThumbnailURL, $archivePath . "thumbnail")) die("Could not retrieve the thumbnail (" . $esc($smallThumbnailURL) . ") for the article $postId !");
		}
	}

	// Fetches the blog (Oppositelock or LALD for example) favicon if not already in the archive ; the same path is used for the check and for the download
	$faviconPath = __DIR__ . "/favicon/" . $blog . ".png";
	if(!file_exists($faviconPath)) {
		// The blog favicon is not present, it should be fetched
		$favicon = $xpath->query("./link[@rel='shortcut icon']", $articleHead)->item(0);
		if(!empty($favicon) && $favicon->hasAttribute("href")) {
			$faviconURL = $favicon->getAttribute("href");
			// On some cases... the URL has no protocol defined (starting with //) which does not work with PHP
			if(substr($faviconURL, 0, 2) == "//") $faviconURL = "https:" . $faviconURL;
			// The href comes from the fetched page : only remote http(s) URLs are fetched so that it can not point to a local file ; a failure is not blocking
			if(preg_match('#^https?://#i', $faviconURL)) $download($faviconURL, $faviconPath);
		}
	}

	// Correcting the author avatar, can be in a <img> or a <video> tag
	$postAuthorAvatarImg = $xpath->query(".//div[@class='sc-1jc3ukb-1 flaMVg']//img[@data-alt='".$postAuthorLogin."']", $starterPost);
	if(!$postAuthorAvatarImg) die("Could not find the avatar img !");
	fixImages($postAuthorAvatarImg, $article, $archivePath, $postAuthorLogin, True);

	// Video avatars have a specific format... a mp4 and a webm, each on their own child <source>
	if($postAuthorAvatarVideo = $xpath->query(".//div[@class='sc-1jc3ukb-1 flaMVg']//video[@data-alt='".$postAuthorLogin."']", $starterPost)->item(0)) {
		foreach($postAuthorAvatarVideo->childNodes as $child) {
			// Text nodes (whitespace between the tags) have no attribute
			if(!($child instanceof DOMElement)) continue;
// TODO: Refactoring with fixImages()
			$videoSrc = $child->getAttribute("src");
			if(!$videoSrc) die("Could not retrieve the src of the video avatar of $postAuthorLogin !");
			if(!filter_var($videoSrc, FILTER_VALIDATE_URL) || !preg_match('#^https://i\.kinja-img\.com/#', $videoSrc)) die("Invalid image URL (" . $esc($videoSrc) . ") !");

			// Extraction of the file extension ; it ends up in a file name so it can only be made of letters and digits
			$fileExtension = substr($videoSrc, strrpos($videoSrc, "." ) + 1);
			if(!preg_match('/^[A-Za-z0-9]{1,10}$/D', $fileExtension)) die("Invalid image URL (" . $esc($videoSrc) . ") !");

			// This is an avatar, it is saved as archive/avatars/$postAuthorLogin.extension
			$avatarFile = $postAuthorLogin . "." . $fileExtension;
			if(!$download($videoSrc, __DIR__ . "/archive/avatars/" . $avatarFile)) die("Could not retrieve " . $esc($videoSrc) . " !");

			// Setting each <source> src with the one on the archive
			$child->setAttribute("src", "../../../avatars/" . $avatarFile);
		}

		// The "poster" attribute of a <video> is the image displayed while the video is loading, no need to bother download it locally
		$postAuthorAvatarVideo->removeAttribute("poster");
	}

	// Finding the <img> and <video> elements of the post
	$images = $xpath->query(".//img", $starterPost);
	$videos = $xpath->query(".//video", $starterPost);

	// Replacing the <img src=''> blank placeholder by the URL of the biggest image listed on the data-srcset of the <img> tag
	fixImages($images, $article, $archivePath, $postAuthorLogin);
	// Same with the <video> elements
	fixImages($videos, $article, $archivePath, $postAuthorLogin);

	// Set to True when a non blocking problem has been encountered : the lockfile of the article is then kept
	$error = False;

	// Some embedded content are on iframes but pointing to a Kinja URL rather than directly to YouTube or Imgur for example : Replacing it with the direct iframe to the service
	// NOTE(review): addslashes() is kept from the original on the values copied in the src ; setAttribute() stores the raw value, so it is probably not needed - confirm before removing
	$iframes = $xpath->query(".//iframe", $starterPost);
	foreach($iframes as $iframe) {
		// The Kinja equivalent of the src attribute (the target URL of the iframe) is on the data-src attribute
		if($datasrc = $iframe->getAttribute("data-src")) {
			preg_match('#^https://[0-9A-Za-z-]+\.kinja\.com/ajax/inset/iframe\?id=(.+)$#', $datasrc, $params);
			if(!isset($params[1])) {
				// This is a "generic" embedding that is not going through the Kinja server (eg: Google Maps) ; the data-src is directly copied on the src
				if(!$iframe->hasAttribute("src")) $iframe->setAttribute("src", addslashes($datasrc));
			} elseif(preg_match('#^youtube-video-([^&\#]+)&?(.*?)$#', $params[1], $youtubeParams)) {
				// The first match will contain the youtubeVideoID and the second the optional parameters such as start=X ; adding the "src" attribute to the iframe with the non-Kinja embedding
				$iframe->setAttribute("src", "https://www.youtube.com/embed/".addslashes($youtubeParams[1])."?".addslashes($youtubeParams[2]));
			} elseif(preg_match('#^youtube-list-(.+?)%2F.*?(&.+)?$#', $params[1], $youtubeParams)) {
				// Seems like youtube-list is also used... for single videos (video ID before the %2F)...
//				$iframe->setAttribute("src", "https://www.youtube.com/embed?listType=playlist&list=".addslashes($youtubeParams[1]));
				// The second group is optional and is missing from the matches when there is no extra parameter
				$iframe->setAttribute("src", "https://www.youtube.com/embed/".addslashes($youtubeParams[1])."?".addslashes(isset($youtubeParams[2]) ? $youtubeParams[2] : ""));
			} elseif(preg_match('#^twitter-([0-9]+)&(.*)$#', $params[1], $twitterParams)) {
				$iframe->setAttribute("src", "https://platform.twitter.com/embed/index.html?dnt=true&id=".addslashes($twitterParams[1])."&".addslashes($twitterParams[2]));
				// Dirty tempfix to limit the cropping as autosizing is not working without the proper JS code
				if($iframe->getAttribute("height") < 520) $iframe->setAttribute("height", 520);
			} elseif(preg_match('#^instagram-([^&]+)&?(.*)$#', $params[1], $instagramParams)) {
				$iframe->setAttribute("src", "https://www.instagram.com/p/".addslashes($instagramParams[1])."/embed/");
				// Dirty tempfix to limit the cropping as autosizing is not working without the proper JS code
				if($iframe->getAttribute("height") < 802) $iframe->setAttribute("height", 802);
			} elseif(preg_match('#^vimeo-(.+)$#', $params[1], $vimeoParams)) {
				$iframe->setAttribute("src", "https://player.vimeo.com/video/".addslashes($vimeoParams[1]));
			} elseif(preg_match('#^imgur-(a%2F)?([^&]+)#', $params[1], $imgurParams)) {
				if($imgurParams[1] == "a%2F")	$iframe->setAttribute("src", "https://imgur.com/a/".addslashes($imgurParams[2])."/embed?pub=true&w=540");
				else				$iframe->setAttribute("src", "https://imgur.com/".addslashes($imgurParams[2])."/embed?pub=true&w=540");
				// Dirty tempfix to limit the cropping as autosizing is not working without the proper JS code
				if($iframe->getAttribute("height") < 500) $iframe->setAttribute("height", 500);
			} elseif(preg_match('#^fbpost-(https%3A%2F%2Fwww.facebook.com%2F.+)$#', $params[1], $facebookParams)) {
				// Facebook post
				$iframe->setAttribute("src", "https://www.facebook.com/plugins/post.php?href=".addslashes($facebookParams[1]));
				// Dirty tempfix to limit the cropping as autosizing is not working without the proper JS code
				if($iframe->getAttribute("height") < 750) $iframe->setAttribute("height", 750);
			} elseif(preg_match('#^fb-([0-9]+)#', $params[1], $facebookParams)) {
				// Facebook video
				$iframe->setAttribute("src", "https://www.facebook.com/v2.3/plugins/video.php?allowfullscreen=true&app_id=&container_width=636&href=https%3A%2F%2Fwww.facebook.com%2Fvideo.php%3Fv%3D".$facebookParams[1]."&sdk=joey");
			} elseif(preg_match('#^dm-(.*)$#', $params[1], $dailymotionParams)) {
				// DailyMotion video
				$iframe->setAttribute("src", "https://www.dailymotion.com/embed/video/".addslashes($dailymotionParams[1]));
			} elseif(preg_match('#^soundcloud-(.*)$#', $params[1], $soundcloudParams)) {
				// Soundcloud track ; raw "&" separators like on every other embedding : the value is escaped when the node is serialized, "&amp;" would end up escaped twice
				$iframe->setAttribute("src", "https://w.soundcloud.com/player/?url=https%3A//api.soundcloud.com/tracks/".urlencode(addslashes($soundcloudParams[1]))."&auto_play=false&hide_related=false&show_comments=true&show_user=true&show_reposts=false&visual=true&show_playcount=true");
			} elseif(preg_match('#^tiktok-([^&]+)#', $params[1], $tiktokParams)) {
				// TikTok post
				$iframe->setAttribute("src", "https://www.tiktok.com/embed/".addslashes($tiktokParams[1]));
				// Dirty tempfix to limit the cropping as autosizing is not working without the proper JS code
				if($iframe->getAttribute("height") < 750) $iframe->setAttribute("height", 750);
			} elseif(preg_match('#^vine-([^&]+)#', $params[1], $vineParams)) {
				$iframe->setAttribute("src", "https://vine.co/v/".addslashes($vineParams[1])."/embed/postcard");
			} elseif(preg_match('#^polldaddy-tag-([^&]+)#', $params[1], $polldaddyParams)) {
				$iframe->setAttribute("src", "https://poll.fm/".addslashes($polldaddyParams[1])."/embed");
			} elseif(preg_match('#^tumblr-post-([^&]+)#', $params[1], $tumblrParams)) {
				$iframe->setAttribute("src", "https://embed.tumblr.com/embed/post/4pJgfiHapNNMJEYz1h7DBw/".addslashes($tumblrParams[1])."?width=542&language=en_US");
// TODO: Fix Tumblr embedding !
				echo "<p>Tumblr embedding is not supported at the moment (post $postId) !</p>";
				$error = True;
			} elseif(preg_match('#^twitch-stream-([^&]+)#', $params[1], $twitchParams)) {
				// Twitch stream iframe : Requires HTTPS to work and to pass the domain of the page as the parent parameter (empty when the script is not called through a web server)
				$iframe->setAttribute("src", "https://player.twitch.tv/?channel=".addslashes($twitchParams[1])."&parent=".(isset($_SERVER["HTTP_HOST"]) ? $_SERVER["HTTP_HOST"] : ""));
			} else {
				die("Unsupported iframe embedding : ".$esc($params[1])." !");
			}
		}
	}

	// <figure> elements of the post
	$figures = $xpath->query(".//figure", $starterPost);
	foreach($figures as $figure) {
		// Images are inside of <figure> that has the width of the image defined on the style tag of it, breaking the centering of the images !
		if( $figure->hasAttribute("class") && preg_match('/(^| )align--center($| )/', $figure->getAttribute("class")) ) {
			// This image is supposed to be centered, removing its problematic style="width: *px;" tag
			$figure->removeAttribute("style");
		}
	}

	// Image galleries have the photos on a <picture> container
	$pictures = $xpath->query(".//picture", $starterPost);
// TODO: Refactoring with fixImages()
	foreach($pictures as $picture) {
		// The images are both in a <source> and an <img> container, the best resolution picture is not always on the same though...
		$maxResolution = 0;
		$imgSrc = NULL;

		// Looping on each element (normally one <source> and one <img>) to find the biggest resolution available
		foreach($picture->childNodes as $child) {
			// Text nodes (whitespace between the tags) have no attribute
			if(!($child instanceof DOMElement)) continue;

			// <source> have the image URL on their "srcset" tag while <img> have on their "src" tag
			if($child->hasAttribute("src"))		$childSrc = $child->getAttribute("src");
			elseif($child->hasAttribute("srcset"))	$childSrc = $child->getAttribute("srcset");
			else die("Could not determine any src for the image inside a <picture> !");

			// The resolution is part of the URL, using a regex to extract it ; the file name class does not contain "/" (the original ".-_" was a range that did)
//			if(!preg_match('#_([0-9]+)/[A-z0-9.-_]+$#', $childSrc, $imgResolution)) die("Could not determine the resolution of $childSrc");
			if(!preg_match('#(_([0-9]+)|/upload)/[A-Za-z0-9._-]+$#', $childSrc, $imgResolution)) die("Could not determine the resolution of " . $esc($childSrc));

			if($imgResolution[1] == "/upload") {
				// The image is on a format without the size, meaning that only one image size is available, no need to continue
				$imgSrc = $childSrc;
				break;
			}

			// The children are not ordered by size, keeping the one with the biggest resolution
			if((int) $imgResolution[2] > $maxResolution) {
				// This image is the biggest so far
				$imgSrc = $childSrc;
				$maxResolution = (int) $imgResolution[2];
			}
		}

		if(!empty($imgSrc)) {
			if(!filter_var($imgSrc, FILTER_VALIDATE_URL) || !preg_match('#^https://i\.kinja-img\.com/#', $imgSrc)) die("Invalid image URL (" . $esc($imgSrc) . ") !");
			// Keeping the same filename as on the Kinja server
			$imgFilename = basename($imgSrc);

			// Downloading the image
			if(!$download($imgSrc, $archivePath . $imgFilename)) die("Could not retrieve " . $esc($imgSrc) . " !");
// TODO: Find a solution to display the galleries on the pages

			// Replacing the image srcset with the one on the archive on both <source> and <img> ; using src on a <source> is not supported anymore
			foreach($picture->childNodes as $child) {
				if(!($child instanceof DOMElement)) continue;
//				$child->setAttribute("src", "./" . $imgFilename);
//				$child->removeAttribute("srcset");
				$child->setAttribute("srcset", "./" . $imgFilename);
				$child->removeAttribute("src");
			}
		} else die("Could not determine the URL of a <picture> !");
	}

	// Extracting the HTML code of the header containing the poster name, the date of the post and views and comments count which is on the first <div> inside of js_starterpost
	$postHeaderNode = $starterPost->childNodes->item(0);
	// The article itself is the second one (<div class="js_post-content">)
	$postContentNode = $starterPost->childNodes->item(1);
	if(!$postHeaderNode || !$postContentNode) die("Could not find the header and the content inside the 'js_starterpost' div !");
	$postHeader = $postHeaderNode->C14N();
	$postContent = $postContentNode->C14N();

	// HTML code of the new "article.html" ; the page title is plain text (nodeValue) and has to be escaped again
	$newPage = '<!DOCTYPE html><html lang="en-us" data-reactroot="">

<head>
	<meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0,maximum-scale=10.0">
	<meta charset="utf-8">
	<!-- Copies of the Kinja CSSs -->
<!--	<link href="/oppo/extractor/style1-unique.css" rel="stylesheet" data-styled="" data-styled-version="5.1.0">-->
	<link href="./style-inline.css" rel="stylesheet" data-styled="" data-styled-version="5.1.0">
	<link href="/oppo/extractor/style2.css" rel="stylesheet">
	<link href="/oppo/extractor/style3.css" rel="stylesheet" id="509bc78f-a82c-4eb8-9919-3bd416e0c2ca">

	<link rel="shortcut icon" type="image/png" href="' . "/oppo/extractor/favicon/{$blog}.png" .'">

	<!-- Those CSS/JS are not from Kinja -->
	<style class="Kinja-Extracted">/*div.js_post-content img, div.image-hydration-wrapper { max-width: 800px; }*/ aside img { width: 160px; height: 90px; } aside a { width: 160px !important; height: 110px !important; }</style>
	<title>' . $esc($pageTitle) . '</title>
</head>

<body>
	<div class="sc-157agsr-0 krcHQb">
		<main class="sc-11qwj9y-0 dDyFIk iBFwie">
			<div class="sc-1lmpmkf-1 gyRthi">
				' . $postTitle . '
			</div>
			<div class="js_starterpost">
				' . $postHeader . "\n" . $postContent . '
			</div>
		</main>
	</div>
	<script type="text/javascript" src="/oppo/extractor/extractor.js"></script>
</body>';

	//<style data-styled="" data-styled-version="5.1.0">
	// Getting the per page generated CSS (inside <style data-styled="" data-styled-version="5.1.0">)
	$pageCSSNode = $xpath->query("./style[@data-styled-version]", $articleHead)->item(0);
	if(!$pageCSSNode) die("Could not find the page specific CSS !");
	$pageCSS = $pageCSSNode->nodeValue;

	// There can be images and fonts linked to Kinja inside the CSS (using url:), used for example on backgrounds of image galleries
//	if(preg_match_all('#url\((https://i\.kinja-img\.com/[^)]+)#', $pageCSS, $imagesOnCSS)) {
	if(preg_match_all('#url\((https://(i\.kinja-img|f\.kinja-static)\.com/[^)?]+)#', $pageCSS, $imagesOnCSS)) {
		// Loop on each image or font linked on the CSS
		foreach($imagesOnCSS[1] as $imgSrc) {
			if(!filter_var($imgSrc, FILTER_VALIDATE_URL)) die("Invalid image URL (" . $esc($imgSrc) . ") inside the CSS !");

			$imgFilename = basename($imgSrc);

			if(substr($imgSrc, 8, 2) == "f.") {
				// It's a font (URL starting with https://f.) and not an image
				if(!$download($imgSrc, __DIR__ . "/archive/fonts/" . $imgFilename)) die("Could not retrieve the font " . $esc($imgSrc) . " listed on the CSS !");

				// Replacing the URL (and its optional query string) on the CSS with the relative URI ; the URL is quoted to be matched literally
				// and a callback is used so that the file name is never read as a replacement pattern
				$localFont = "(../../../fonts/" . $imgFilename . ")";
				$updatedCSS = preg_replace_callback("#\(" . preg_quote($imgSrc, "#") . "(\?.*?)?\)#", function() use ($localFont) { return $localFont; }, $pageCSS);
				if($updatedCSS !== null) $pageCSS = $updatedCSS;
			} else {
				// If the image has not already been fetched (it's usually already done by fixImages()), fetching it
				if(!$download($imgSrc, $archivePath . $imgFilename)) die("Could not retrieve the image " . $esc($imgSrc) . " listed on the CSS !");

				// Replacing the URL on the CSS with the relative URI ; a plain string replacement is enough as there is nothing optional to match
				$pageCSS = str_replace("(" . $imgSrc . ")", "(./" . $imgFilename . ")", $pageCSS);
			}
		}
	}

	// Saving the CSS that was inline on the page and generated specifically for this page
	if(file_put_contents($archivePath . "style-inline.css", $pageCSS) === false) die("Could not save the page specific CSS !");

	// Saving the generated page
	if(file_put_contents($archivePath . "article.html", $newPage) === false) die("Could not save the new page content !");

	// Saving the article JSON (including comments) from the API
//	file_put_contents($archivePath . "articleMetadatas.json", file_get_contents("https://oppositelock.kinja.com/api/core/corepost/getList?id=".$postId));
// TODO: Parse to fetch the image/videos from comments
	$comments = file_get_contents("https://kinja.com/ajax/comments/views/flatReplies/".$postId."?startIndex=0&maxReturned=5000&approvedOnly=false&cache=true&sorting=oldest");
	if($comments === false || file_put_contents($archivePath . "articleMetadatas.json", $comments) === false) {
		// Not blocking, but the lockfile is kept so that the article can be found and fetched again
		echo "<p>Could not retrieve the comments of the article $postId !</p>\n";
		$error = True;
	}

	// Creating the "archived" file as everything has been fetched for this article
	file_put_contents($archivePath . "archived", "1");

	// Removal of the lockfile for this article as it has been successfully archived or creation of a lock file if the function was called manually (and thus not creating a lockfile) and an error has been encountered
	if(!$error && file_exists($lockFile))		unlink($lockFile);
	elseif($error && !file_exists($lockFile))	file_put_contents($lockFile, getmypid());

	// The page has been archived, redirecting (301) to it
	////header("Location: ." . $relativeArchivePath . "article.html");
	echo("<p>The generated archive page is available at <a href='.{$relativeArchivePath}article.html'>{$relativeArchivePath}article.html</a> !</p>\n");

	return True;
}

// Fix the blank images and videos when not loading the Kinja JS
//
// Every element whose "src" or "poster" attribute is the placeholder data URI is rewritten so that it points to a
// local copy of the media:
//  - elements with a "data-srcset" (or "srcset") attribute : the candidate with the biggest descriptor is saved and used as "src"
//  - elements with a "data-mp4src" attribute : the MP4 (and the WEBM when it can be fetched) is saved and added as a <source> child
// Elements that do not carry the placeholder are left untouched.
//
// $elements        : list of the elements to check (anything usable in a foreach whose items expose getAttribute()/setAttribute()/appendChild() - presumably a DOMNodeList, to be confirmed against the caller)
// $article         : the document used to create the new <source> elements (only createElement() is called on it)
// $archivePath     : folder where the article medias are saved ; filenames are appended as is, so it has to end with a directory separator
// $postAuthorLogin : login used as the filename of the avatar under archive/avatars/ (NOTE(review): not validated here, presumably checked by the caller)
// $avatar          : True when $elements are avatars (saved on the shared avatars folder instead of the article folder)
//
// Always returns True ; the script is stopped (die) on an invalid media URL or when a mandatory media cannot be retrieved.
function fixImages($elements, $article, $archivePath, $postAuthorLogin, $avatar = False) {
	// Value of "src"/"poster" when the real media is meant to be loaded later by the Kinja JS
	$placeholder = "data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==";

	// Only well formed URLs hosted on the Kinja image server are accepted.
	// The flag is passed as an integer : inside a plain array it is ignored by filter_var(), and FILTER_FLAG_HOST_REQUIRED does not exist anymore since PHP 8.0 (the host is always required)
	$isKinjaMedia = function($url) {
		return is_string($url)
			&& filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_PATH_REQUIRED) !== False
			&& preg_match('#^https://i\.kinja-img\.com/#', $url) === 1;
	};

	// A media is considered as archived only if the local file is not empty (an empty file is the leftover of a failed download)
	$isArchived = function($path) {
		return is_file($path) && filesize($path) > 0;
	};

	// Saves $url as $destination unless it has already been done ; nothing is written when the download fails, so a broken file is never mistaken for an archived media later
	$download = function($url, $destination) use ($isArchived) {
		if($isArchived($destination)) return True;

		$content = file_get_contents($url);
		if($content === False || $content === "") return False;

		return file_put_contents($destination, $content) !== False;
	};

	// Loop on each <img> or <video> on the page
	foreach($elements as $element) {
		// Only the elements using the placeholder need to be fixed
		if($element->getAttribute("src") !== $placeholder && $element->getAttribute("poster") !== $placeholder) continue;

		// A working URL for the image can be found on the "data-srcset" attribute that lists multiple sizes of the image (or the attribute "srcset" for the avatars) ; the best resolution available will be used
		$srcset = $element->getAttribute("data-srcset");
		if(empty($srcset)) $srcset = $element->getAttribute("srcset");

		if(!empty($srcset)) {
			$maxResolution = -1;
			$imgSrc = $imgSrcSmall = NULL;

			// The candidates are separated by a comma followed by a space ; a comma alone can't be used as a separator, it may be part of an URL
			foreach(preg_split('/,\s+/', trim($srcset)) as $candidate) {
				$parts = preg_split('/\s+/', trim($candidate));
				if(empty($parts[0])) continue;

				// The descriptor is for example "320w" for an image of a width of 320px ; a candidate without descriptor is only used if there is nothing else
				$resolution = isset($parts[1]) ? (int) $parts[1] : 0;

				// The candidates are not ordered, all of them have to be checked to find the biggest one
				if($resolution > $maxResolution) {
					$imgSrc = $parts[0];
					$maxResolution = $resolution;
				}

				// The default size used for avatars is 80x80, fetching it as well as the full size avatar images that are not always on the right aspect ratio (and aggressive resizing by browsers is not always working well)
				if($avatar && $resolution === 80) $imgSrcSmall = $parts[0];
			}

			if(empty($imgSrc)) continue;

			if(!$isKinjaMedia($imgSrc)) die("Invalid image URL ($imgSrc) !");

			if(!$avatar) {
				// It's an image from an article, saved on the archive folder of the article with the same filename as on the Kinja server
				$imgFilename = basename($imgSrc);

				if(!$download($imgSrc, $archivePath . $imgFilename)) die("Could not retrieve $imgSrc !");

				// Replacing the image src with the one on the archive
				// TODO: Should the "data-srcset" attribute be renamed as "srcset"? (or simply removed if only the max resolution image is imported)
				$element->setAttribute("src", "./" . $imgFilename);
			} else {
				// This is an avatar, it is saved as ./avatars/$postAuthorLogin, the extension is omitted to simplify the check if the avatar is already here (or if the user changes its avatar for a different filetype at some point)
				$avatarPath = __DIR__ . "/archive/avatars/{$postAuthorLogin}";
				$avatarSmallPath = $avatarPath . ".small";

				if(!$download($imgSrc, $avatarPath)) die("Could not retrieve $imgSrc !");

				// If available, fetching the 80x80 avatar (best effort : the full size avatar is used if it fails)
				if(!empty($imgSrcSmall) && $isKinjaMedia($imgSrcSmall)) $download($imgSrcSmall, $avatarSmallPath);

				if($isArchived($avatarSmallPath)) {
					// Replacing the image src with the one in 80x80 on the avatars folder
					$element->setAttribute("src", "../../../avatars/{$postAuthorLogin}.small");
				} else {
					// Replacing the image src with the one in the maximum resolution on the avatars folder
					$element->setAttribute("src", "../../../avatars/{$postAuthorLogin}");
				}

				// To be sure that the navigator won't fetch images from Kinja, the attributes with links to images on the Kinja servers are removed
				$element->removeAttribute("srcset");
				$element->removeAttribute("data-srcset");
			}

			continue;
		}

		$mp4src = $element->getAttribute("data-mp4src");
		if(empty($mp4src)) continue;

		// This is a <video> with a MP4 content, creating <source> children with the MP4 and WEBM formats
		if(!$isKinjaMedia($mp4src)) die("Invalid image URL ($mp4src) !");

		$mp4Filename = basename($mp4src);
		if(!$download($mp4src, $archivePath . $mp4Filename)) die("Could not retrieve $mp4src !");

		// The <source> points to the file saved on the archive and not to the Kinja server
		$contentMP4 = $article->createElement("source");
		$contentMP4->setAttribute("type", "video/mp4");
		$contentMP4->setAttribute("src", "./" . $mp4Filename);
		$element->appendChild($contentMP4);

		// The WEBM version is expected at the same URL with a different extension ; it is optional
		$webmsrc = preg_replace('/\.mp4$/', ".webm", $mp4src);
		if(is_string($webmsrc) && $webmsrc !== $mp4src) {
			$webmFilename = basename($webmsrc);

			// The WEBM <source> is only added if the file is on the archive (already there or just retrieved)
			if($download($webmsrc, $archivePath . $webmFilename)) {
				$contentWEBM = $article->createElement("source");
				$contentWEBM->setAttribute("type", "video/webm");
				$contentWEBM->setAttribute("src", "./" . $webmFilename);
				$element->appendChild($contentWEBM);
			}
		}
	}

	return True;
}
?>