100) die("Invalid maxReturned value, must be between 1 and 100 !"); if(!is_dir(__DIR__ . "/archive/$blog")) mkdir(__DIR__ . "/archive/$blog"); // First execution of the function, starting at the current date if(!file_exists(__DIR__ . "/archive/$blog/autoOldest")) { // The PHP timestamp is in second while the Kinja one is in milliseconds, adding 4 zeroes by multiplying $oldest = time()*1000; file_put_contents(__DIR__ . "/archive/$blog/autoNewest", $oldest); } else { // Getting the date of the last fetched post $oldest = (int) trim(file_get_contents(__DIR__ . "/archive/$blog/autoOldest")); if($oldest == -1) die("All posts older than ".trim(file_get_contents(__DIR__ . "/archive/$blog/autoNewest"))." have already been retrieved for $blog !"); if($oldest < 1000000000000) die("Invalid timestamps on ". __DIR__ . "/archive/$blog/autoOldest ($oldest) !"); } if(!is_int($oldest)) die("Invalid autoOldest data ($oldest) for $blog !"); // The URL is not the same between blogs and users posts (if there is a "/" after the login for users, it does not work) if(!$user) $url = "https://${blog}.kinja.com/?startTime=${oldest}&maxReturned=${maxReturned}"; // TODO: startTime does not work for users... else $url = "https://kinja.com/${blog}?startTime=${oldest}&maxReturned=${maxReturned}"; // The blogs mainpages can start with posts after a certain timestamp by passing the GET parameter "startTime" if($newOldest = fetchArticlesFromPage($url, max($count, 1))) { if(!is_numeric($newOldest)) die("Invalid newOldest timestamp($newOldest) for $blog !"); if($newOldest > $oldest) die("The new oldest timestamp is newer than the previous !"); // $count articles have been archived (or all the available ones have already been archived), saving the timestamp to start at for the next execution of the script file_put_contents(__DIR__ . 
"/archive/$blog/autoOldest", $newOldest); } else die("Error calling fetchArticlesFromPage() !"); }

// Archives the articles listed on a Kinja page (blog mainpage, tag/search page or user posts page) and saves, for each of them, the JSON found on its js_editor-tools div
//
// $url   : URL of the page listing the articles ; it has to match the Kinja mainpage/user page pattern checked below
// $count : maximum number of articles to fetch ; 0 being "unlimited" (limited by what is listed on the requested page)
//
// Returns (timestamps are in milliseconds, like the Kinja "startTime" GET parameter) :
//   - the timestamp of the article being processed when the number of newly archived articles reaches $count
//   - otherwise, when $count > 0 : -1 if the page has no "Next Page" link, else the timestamp to start at for the next page
//   - NULL when $count is 0
// Any error stops the script with die()
function fetchArticlesFromPage($url, $count = 0) {
    // Checking that the URL that has been passed is a valid Kinja "mainpage" (or user page) URL
    // No flag is given to filter_var() : the previous array(FILTER_FLAG_PATH_REQUIRED, FILTER_FLAG_HOST_REQUIRED) had no "flags" key
    // so it was ignored, and FILTER_FLAG_HOST_REQUIRED does not exist anymore since PHP 8.0 (using it throws an Error)
    if(!filter_var($url, FILTER_VALIDATE_URL) || !preg_match('#^https://([A-z0-9]{2,50}\.kinja\.com(/?|/tag/[A-z0-9-_]+|/search/?\?[A-z0-9-_&=%]+$)|kinja\.com/([A-z0-9-_]+))(/?\?.+)?$#i', $url)) {
        die("Invalid mainpage URL ($url) !");
    }

    // Parsing the page with DOMDocument so that it can be queried with XPath
    $mainpage = new DOMDocument;
    // To avoid warnings for non standard tags on the HTML code
    $mainpage->strictErrorChecking = false;
    $mainpage->recover = true;

    // Fetching the content from the page on Kinja
    if(!@$mainpage->loadHTMLFile($url)) die("Could not retrieve the page content");

    $xpath = new DOMXPath($mainpage);
    $articles = $xpath->query("/html/body//article");
    if($articles->length == 0) die("No article found on $url !");

    // Returns the UNIX timestamp (in seconds) found on the <time datetime="..."> of an <article> ; stops the script
    // instead of letting a missing or unparsable date be turned into a bogus timestamp
    $articleTimestamp = function($articleNode) use ($xpath, $url) {
        $time = $xpath->query(".//time", $articleNode)->item(0);
        $seconds = ($time !== null) ? strtotime($time->getAttribute("datetime")) : false;
        if($seconds === false) die("Could not determine the date of an article listed on $url !");
        return $seconds;
    };

    // Number of articles archived by this call
    $i = 0;

    // Loop on each <article> on the page
    foreach($articles as $article) {
        $postId = $article->getAttribute("data-id");
        if(empty($postId)) die("Could not determine the post Id !");

        // Extracting the JSON on the div js_editor-tools that contains the URL to the post (postPermalink)
        $editorTools = $xpath->query(".//div[@class='js_editor-tools sc-1i9kufk-0 ixGbux']", $article)->item(0);
        if($editorTools === null) die("Could not find the js_editor-tools div of the article $postId on $url !");
        $postToolsWrapperJSON = $editorTools->getAttribute("data-state");
        $postToolsWrapper = json_decode($postToolsWrapperJSON);

        // An invalid JSON or a missing postPermalink ends up on the "could not determine the blog" branch below, as it did before (minus the PHP warnings)
        $postPermalink = "";
        if(is_object($postToolsWrapper) && isset($postToolsWrapper->postPermalink) && is_string($postToolsWrapper->postPermalink)) {
            $postPermalink = $postToolsWrapper->postPermalink;
        }

        if(!preg_match("#^https://([^\.]+)\.kinja\.com/#", $postPermalink, $permalinkMatch)) {
            // This is a link "shared" from one of the main websites such as Jalopnik, no need to save this one
            // NOTE(review): the line breaks inside the three messages of this loop are part of the string literals as found ; they may be stripped <p> tags (fetchArticle() has this message with <p>...</p>) - confirm
            echo("

Could not determine the blog from the Permalink ($postPermalink) on $url, this article is not backuped !

\n");
            continue;
        }
        $blog = $permalinkMatch[1];

        // Retrieving the Kinja login of the author from the link to his page
        $postAuthor = $xpath->query(".//div[@class='sc-1mep9y1-0 sc-1ixdk2y-0 fjyRLU sc-1rye1-3 hwANEt']/a[@class='sc-1out364-0 hMndXN js_link']", $article)->item(0);
        if($postAuthor === null) die("Could not determine the article $postId author !");
        if(!$postAuthorLogin = preg_replace("#^https://kinja\.com/#", "", $postAuthor->getAttribute("href"))) die("Could not determine the article $postId author !");

        // Checking if the local archive for this article already exists
        // NOTE(review): $postId and $postAuthorLogin come from the remote page and are used as path components without validation here (fetchArticle() checks the login against a pattern) - confirm this is acceptable
        $relativeArchivePath = "/archive/" . $blog . "/" . $postAuthorLogin . "/" . $postId . "/";
        $archivePath = __DIR__ . $relativeArchivePath;

        // NOTE(review): forceUpdate is read as a constant (no "$") ; it has to be define()d outside of this function - confirm
        if(!forceUpdate && file_exists($archivePath . "archived")) {
            echo "

Article already archived as {$relativeArchivePath}article.html !

\n";
        }
        elseif(file_exists("fetchArticle.$postId.lock")) {
            echo "

Lock file on {$relativeArchivePath}article.html !

\n";
            continue;
        }
        else {
            // This article can and will be archived, creating a lockfile to avoid having two instances of the script fetching the same article at the same time ; the lock file is kept if the operation failed, making it easier to retrieve problematic articles
            file_put_contents("fetchArticle.$postId.lock", getmypid());
            if(fetchArticle($postPermalink)) $i++;
        }

        // Saving the raw JSON next to the archived article, only if its folder exists (nothing can be written otherwise)
        if(is_dir($archivePath) && !file_exists($archivePath . "js_editor-tools.txt")) {
            file_put_contents($archivePath . "js_editor-tools.txt", $postToolsWrapperJSON);
        }

        if($count > 0 && $i >= $count) {
            // $count articles have been archived, returning the timestamp of the current article in milliseconds
            return $articleTimestamp($article) * 1000;
        }
    }

    if($count > 0) {
        // Fewer than $count articles had to be archived on this page, the next call has to start at the "Next Page" link
        $nextPageLink = $xpath->query(".//div[@class='sc-1uzyw0z-0 kNHeFZ']//a[@class='sc-1out364-0 hMndXN js_link']")->item(0);

        // If there is no link to a next page then we are already on the last one, returning -1 means that all "old" posts have been fetched for this "blog"
        if($nextPageLink === null) return -1;

        $nextPageHref = $nextPageLink->getAttribute("href");
        if(!$nextPageHref) die("Could not determine the timestamp for the next page !");

        // Mainpages have "?startTime=<timestamp>" as the href of their next page link : keeping only the timestamp
        // (the previous test, substr($href, 11) == "?startTime=", compared the END of the href with the prefix and could never match)
        if(preg_match('#^\?startTime=([0-9]+)#', $nextPageHref, $startTime)) {
            return $startTime[1];
        }

        // User posts list pages have ?startIndex= on their next page links so we have to determine the timestamp of the last article listed on the page and decrement it by 1 second
        return ($articleTimestamp($article) - 1) * 1000;
    }

    // $count is 0 ("unlimited") : there is no timestamp to return
    return null;
}

// Fetches the article, extract useful informations to only keep the article content itself, downloads localy a copy of the medias (videos and images) and make the article "static" (no JS or CSS or media fetched from Kinja) ; fetches the thumbnail (used on mainpages/users posts pages) function fetchArticle($url) { // Checking the the post URL that has been passed as the "article" GET variable is a valid Kinja URL if(is_numeric($url)) $url = "https://oppositelock.kinja.com/$url"; if(!filter_var($url, FILTER_VALIDATE_URL, array(FILTER_FLAG_PATH_REQUIRED, FILTER_FLAG_HOST_REQUIRED)) || !preg_match('#^https://([A-z0-9-]{2,50})\.kinja\.com/([A-z0-9-_]+)$#i', $url, $article)) { die("Invalid article URL ($url) !"); } // $blog = $article[1]; $articleURI = $article[2]; // Using DOMDocument to extract only the post content of the page (not the header, footer, "You may also like", Comments section and list of other posts ; also used to fix src tags $article = new DOMDocument; // To avoid warnings for non standard tags on the HTML code $article->strictErrorChecking = false; $article->recover = true; // Fetching the content from the page on Kinja if(!@$article->loadHTMLFile($url)) die("Could not retrieve the page content"); //$articleHead = 
$dom->getElementsByTagName("head")->item(0); // Getting the publish-date of the page $xpath = new DOMXPath($article); $articleHead = $xpath->query("/html/head")->item(0); // The date the article has been published // $publishDate = $xpath->query("./meta[@name='publish-date']", $articleHead)->item(0)->getAttribute("content") or die("Could not retrieve the publish-date"); // Converting the date string // $publishDate = strtotime($publishDate); // Getting the title of the page ( tag) $pageTitle = $xpath->query("./title", $articleHead)->item(0)->nodeValue; // Getting the title of the post (<header> tag) if($postTitle = $xpath->query("/html/body//header")->item(0)) { // Some articles dont have a title and thus don't have the <header> tag if(!$postTitle = $postTitle->C14N()) die("Could not find the <header> with the title !"); } // The $url can sometimes redirect to another blog, we fetch the "right" URL inside the "canonical" <link> to extract the right blog name of this article $postPermalink = $xpath->query("./link[@rel='canonical']", $articleHead)->item(0)->getAttribute("href") or die("Could not retrieve the canonical URL of this article ($url) !"); if(preg_match("#^https://([^\.]+)\.kinja\.com/#", $postPermalink, $blog)) $blog = $blog[1]; else die("<p>Could not determine the blog from the Permalink ($postPermalink) on $url, this article is not backuped !</p>\n"); // The interesting content is in the "js_starterpost" <div> if(!$starterPost = $xpath->query("/html/body//div[@class='js_starterpost']")->item(0)) die("Could not find the 'js_starterpost' div !"); // Extracting the JSON on the div post-tools-wrapper that contains the postId (the unique numeric identfier of each posts) $toolsDiv = $xpath->query(".//div[@class='sc-83o472-2 hNtXBx']", $starterpost)->item(0); $postToolsWrapper = json_decode($xpath->query(".//div[@class='post-tools-wrapper']", $toolsDiv)->item(0)->getAttribute("data-state")); $postId = $postToolsWrapper->postId; if(empty($postId)) die("Could not 
determine the post Id !"); // Removal of the "js_share-tools" <div> that contains the "Share to Facebook/Twitter/Email/Link" buttons that are of no use $jsShareTools = $xpath->query("./div[@class='js_share-tools']", $toolsDiv)->item(0); $jsShareTools->parentNode->removeChild($jsShareTools); // Retrieving the author name and his Kinja login ; it is not in the first "<a class='sc-1out364-0 hMndXN js_link'>" of the js_starterpost but in the second... if(!$postAuthor = $xpath->query(".//div[@class='sc-1jc3ukb-2 fUsAEy']//a[@class='sc-1out364-0 hMndXN js_link']", $starterPost)->item(0)) die("Could not find the 'js_link' div !"); $postAuthorName = $postAuthor->nodeValue; $postAuthorLogin = preg_replace("|^https://kinja\.com/|", "", $postAuthor->getAttribute("href")); if(empty($postAuthorName) || empty($postAuthorLogin)) die("Could not determine the author name or login !"); if(!preg_match('/^[A-z0-9_-]{2,50}$/', $postAuthorLogin)) die("Invalid author login ($postAuthorLogin) !"); // Checking if the local archive for this article already exists //$archivePath = date("y/n/j", $publishDate); $relativeArchivePath = "/archive/" . $blog . "/" . $postAuthorLogin . "/" . $postId . "/"; $archivePath = __DIR__ . $relativeArchivePath; // If the article has not been archived yet, it needs the data folder to be created if(!is_dir($archivePath) && !mkdir($archivePath, 0750, True)) die("Could not create the archive folder for the article $postId !"); if(file_exists($archivePath . "archived")) { // The page has already been archive, redirecting (301) to it //// TODO: Remove comment for production use //// header("Location: ." . $relativeArchivePath . "article.html"); //// die("Should have redirected to the already generated archive page <a href='.${relativeArchivePath}article.html'>${relativeArchivePath}article.html</a> !"); } // if(!file_exists($archivePath . "datetime.txt")) file_put_contents($archivePath . 
"datetime.txt", $xpath->query("./meta[@name='publish-date']", $articleHead)->item(0)->getAttribute("content")) or die("Could not retrieve the publish-date"); // Saving the thumbnail of the article in the resolution used by the mainpage/author posts page if(!file_exists($archivePath . "thumbnail")) { // The post thumbnail is on the content attribute of the header <meta property="og:image"> but not in the small resolution used by the main pages, fetching the version that is 265px wide instead (same filename with only the previous part of the URL changing) $thumbnailURL = $xpath->query("./meta[@property='og:image']", $articleHead)->item(0)->getAttribute("content"); // Articles without a thumbnail have their og:image pointing to a placeholder image, no need to fetch it if($thumbnailURL != "https://x.kinja-static.com/assets/images/logos/placeholders/default.png") file_put_contents( $archivePath . "thumbnail", file_get_contents("https://i.kinja-img.com/gawker-media/image/upload/c_fill,f_auto,g_center,h_149,pg_1,q_60,w_265/".basename($thumbnailURL)) ) or die("Could not retrieve the thumbnail (https://i.kinja-img.com/gawker-media/image/upload/c_fill,f_auto,g_center,h_149,pg_1,q_60,w_265/".basename($thumbnailURL).") for the article $postId !"); } // Fetches the blog (Oppositelock or LALD for example) favicon if not already in the archive if(!file_exists("./favicon/".$blog.".png")) { // The blog favicon is not present, it should be fetched $favicon = $xpath->query("./link[@rel='shortcut icon']", $articleHead)->item(0); if(!empty($favicon) && $favicon->hasAttribute("href")) { // On some cases... the URL has no protocol defined (starting with //) which does not work with PHP if(substr($favicon->getAttribute("href"), 0, 2) == "//") $url = "https:".$favicon->getAttribute("href"); else $url = &$favicon->getAttribute("href"); file_put_contents(__DIR__ . 
"/favicon/".$blog.".png", file_get_contents($url) ); } } // Correcting the author avatar, can be in a <img> or a <video> tag if(!$postAuthorAvatarImg = $xpath->query(".//div[@class='sc-1jc3ukb-1 flaMVg']//img[@data-alt='".$postAuthorLogin."']", $starterPost)) die("Could not find the avatar img !"); fixImages($postAuthorAvatarImg, $article, $archivePath, $postAuthorLogin, True); // Video avatars have a specific format... a mp4 and a webm, each on their own child <source> if($postAuthorAvatarVideo = $xpath->query(".//div[@class='sc-1jc3ukb-1 flaMVg']//video[@data-alt='".$postAuthorLogin."']", $starterPost)->item(0)) { foreach($postAuthorAvatarVideo->childNodes as $child) { // TODO: Refactoring with fixImages() $videoSrc = $child->getAttribute("src") or die("Could not retrieve the src of the video avatar of $postAuthorLogin !"); if(!filter_var($videoSrc, FILTER_VALIDATE_URL, array(FILTER_FLAG_PATH_REQUIRED, FILTER_FLAG_HOST_REQUIRED)) || !preg_match('#^https://i\.kinja-img\.com/#', $videoSrc)) die("Invalid image URL ($videoSrc) !"); // Extraction of the file extension $fileExtension = substr($videoSrc, strrpos($videoSrc, "." ) + 1); // This is an avatar, it is saved as ./avatars/$postAuthorLogin.extension if( !file_exists(__DIR__ . "/archive/avatars/${postAuthorLogin}.${fileExtension}") && !file_put_contents(__DIR__ . 
"/archive/avatars/${postAuthorLogin}.${fileExtension}", file_get_contents($videoSrc)) ) die("Could not retrieve $videoSrc !"); // Setting each <content> src with the one on the archive $child->setAttribute("src", "../../../avatars/${postAuthorLogin}.${fileExtension}"); } // The "poster" attribute of a <video> is the image displayed while the video is loading, no need to bother download it locally $postAuthorAvatarVideo->removeAttribute("poster"); } // Finding the <img> and <video> elements of the post $images = $xpath->query(".//img", $starterPost); $videos = $xpath->query(".//video", $starterPost); // Replacing the <img src=''> blank placeholder by the URL of the biggest image listed on the data-srcset of the <img> tag fixImages($images, $article, $archivePath, $postAuthorLogin); // Same with the <video> elements fixImages($videos, $article, $archivePath, $postAuthorLogin); // Some embedded content are on iframes but pointing to a Kinja URL rather than directly to YouTube or Imgur for example : Replacing it with the direct iframe to the service $iframes = $xpath->query(".//iframe", $starterPost); foreach($iframes as $iframe) { // The Kinja equivalent of the src attribute (the target URL of the iframe) is on the data-src attribute if($datasrc = $iframe->getAttribute("data-src")) { preg_match('#^https://[0-9A-z-]+\.kinja\.com/ajax/inset/iframe\?id=(.+)$#', $datasrc, $params); if(!isset($params[1])) { // This is a "generic" embedding that is not going through the Kinja server (eg: Google Maps) ; the data-src is directly copied on the src if(!$iframe->hasAttribute("src")) $iframe->setAttribute("src", addslashes($datasrc)); } elseif(preg_match('#^youtube-video-([^&\#]+)&?(.*?)$#', $params[1], $youtubeParams)) { // The first match will contain the youtubeVideoID and the second the optional parameters such as start=X ; adding the "src" attribute to the iframe with the non-Kinja embedding $iframe->setAttribute("src", 
"https://www.youtube.com/embed/".addslashes($youtubeParams[1])."?".addslashes($youtubeParams[2])); } elseif(preg_match('#^youtube-list-(.+?)%2F.*?(&.+)?$#', $params[1], $youtubeParams)) { // Seems like youtube-list is also used... for single videos (video ID before the %2F)... // $iframe->setAttribute("src", "https://www.youtube.com/embed?listType=playlist&list=".addslashes($youtubeParams[1])); $iframe->setAttribute("src", "https://www.youtube.com/embed/".addslashes($youtubeParams[1])."?".addslashes($youtubeParams[2])); } elseif(preg_match('#^twitter-([0-9]+)&(.*)$#', $params[1], $twitterParams)) { $iframe->setAttribute("src", "https://platform.twitter.com/embed/index.html?dnt=true&id=".addslashes($twitterParams[1])."&".addslashes($twitterParams[2])); // Dirty tempfix to limit the cropping as autosizing is not working without the proper JS code if($iframe->getAttribute("height") < 520) $iframe->setAttribute("height", 520); } elseif(preg_match('#^instagram-([^&]+)&?(.*)$#', $params[1], $instagramParams)) { $iframe->setAttribute("src", "https://www.instagram.com/p/".addslashes($instagramParams[1])."/embed/"); // Dirty tempfix to limit the cropping as autosizing is not working without the proper JS code if($iframe->getAttribute("height") < 802) $iframe->setAttribute("height", 802); } elseif(preg_match('#^vimeo-(.+)$#', $params[1], $vimeoParams)) { $iframe->setAttribute("src", "https://player.vimeo.com/video/".addslashes($vimeoParams[1])); } elseif(preg_match('#^imgur-(a%2F)?([^&]+)#', $params[1], $imgurParams)) { if($imgurParams[1] == "a%2F") $iframe->setAttribute("src", "https://imgur.com/a/".addslashes($imgurParams[2])."/embed?pub=true&w=540"); else $iframe->setAttribute("src", "https://imgur.com/".addslashes($imgurParams[2])."/embed?pub=true&w=540"); // Dirty tempfix to limit the cropping as autosizing is not working without the proper JS code if($iframe->getAttribute("height") < 500) $iframe->setAttribute("height", 500); } 
elseif(preg_match('#^fbpost-(https%3A%2F%2Fwww.facebook.com%2F.+)$#', $params[1], $facebookParams)) { // Facebook post $iframe->setAttribute("src", "https://www.facebook.com/plugins/post.php?href=".addslashes($facebookParams[1])); // Dirty tempfix to limit the cropping as autosizing is not working without the proper JS code if($iframe->getAttribute("height") < 750) $iframe->setAttribute("height", 750); } elseif(preg_match('#^fb-([0-9]+)#', $params[1], $facebookParams)) { // Facebook video $iframe->setAttribute("src", "https://www.facebook.com/v2.3/plugins/video.php?allowfullscreen=true&app_id=&container_width=636&href=https%3A%2F%2Fwww.facebook.com%2Fvideo.php%3Fv%3D".$facebookParams[1]."&sdk=joey"); } elseif(preg_match('#^dm-(.*)$#', $params[1], $dailymotionParams)) { // DailyMotion video $iframe->setAttribute("src", "https://www.dailymotion.com/embed/video/".addslashes($dailymotionParams[1])); } elseif(preg_match('#^soundcloud-(.*)$#', $params[1], $soundcloudParams)) { // Soundcloud track $iframe->setAttribute("src", "https://w.soundcloud.com/player/?url=https%3A//api.soundcloud.com/tracks/".urlencode(addslashes($soundcloudParams[1]))."&auto_play=false&hide_related=false&show_comments=true&show_user=true&show_reposts=false&visual=true&show_playcount=true"); } elseif(preg_match('#^tiktok-([^&]+)#', $params[1], $tiktokParams)) { // TikTok post $iframe->setAttribute("src", "https://www.tiktok.com/embed/".addslashes($tiktokParams[1])); // Dirty tempfix to limit the cropping as autosizing is not working without the proper JS code if($iframe->getAttribute("height") < 750) $iframe->setAttribute("height", 750); } elseif(preg_match('#^vine-([^&]+)#', $params[1], $vineParams)) { $iframe->setAttribute("src", "https://vine.co/v/".addslashes($vineParams[1])."/embed/postcard"); } elseif(preg_match('#^polldaddy-tag-([^&]+)#', $params[1], $polldaddyParams)) { $iframe->setAttribute("src", "https://poll.fm/".addslashes($polldaddyParams[1])."/embed"); } 
elseif(preg_match('#^tumblr-post-([^&]+)#', $params[1], $tumblrParams)) { $iframe->setAttribute("src", "https://embed.tumblr.com/embed/post/4pJgfiHapNNMJEYz1h7DBw/".addslashes($tumblrParams[1])."?width=542&language=en_US"); // TODO: Fix Tumblr embedding ! echo "<p>Tumblr embedding is not supported at the moment (post $postId) !</p>"; $error = True; } elseif(preg_match('#^twitch-stream-([^&]+)#', $params[1], $twitchParams)) { // Twitch stream iframe : Requires HTTPS to work and to pass the domain of the page as the parent parameter $iframe->setAttribute("src", "https://player.twitch.tv/?channel=".addslashes($twitchParams[1])."&parent=".$_SERVER["HTTP_HOST"]); } else { die("Unsupported iframe embedding : ".$params[1]." !"); } } } // <figure> elements $figures = $xpath->query(".//figure", $starterpost); foreach($figures as $figure) { // Images are inside of <figure> that has the width of the image defined on the style tag of it, breaking the centering of the images ! if( $figure->hasAttribute("class") && preg_match('/(^| )align--center($| )/', $figure->getAttribute("class")) ) { // This image is supposed to be centered, removing it's problematic style="width: *px;" tag $figure->removeAttribute("style"); } } // Image galleries have the photos on a <picture> container $pictures = $xpath->query(".//picture", $starterpost); // TODO: Refactoring with fixImages() foreach($pictures as $picture) { // The images are both in a <source> and an <img> container, the best resolution picture is not always on the same though... 
$maxResolution = $imgSrc = NULL; // Looping on each element (normally one <source> and one <img> to find the biggest resolution available foreach($picture->childNodes as $child) { // <source> have the image URL on their "srcset" tag while <img> have on their "src" tag if($child->hasAttribute("src")) $childSrc = $child->getAttribute("src"); elseif($child->hasAttribute("srcset")) $childSrc = $child->getAttribute("srcset"); else die("Could not determine any src for the image inside a <picture> !"); // The resolution is part of the URL, using a regex to extract it // if(!preg_match('#_([0-9]+)/[A-z0-9.-_]+$#', $childSrc, $imgResolution)) die("Could not determine the resolution of $childSrc"); if(!preg_match('#(_([0-9]+)|/upload)/[A-z0-9.-_]+$#', $childSrc, $imgResolution)) die("Could not determine the resolution of $childSrc"); // The resolution in the srcset is for example "320w" for an image of a width of 320px ; they are not ordered, looping on each to find the one with the biggest resolution is required if($imgResolution[1] == "/upload" || $imgResolution[2] > $maxResolution) { // This image is the biggest so far $imgSrc = $childSrc; $maxResolution = $imgResolution[2]; // The image is on a format without the size, meaning that only one image size is available, no need to continue if($imgResolution[1] == "/upload") break; } } if(!empty($imgSrc)) { if(!filter_var($imgSrc, FILTER_VALIDATE_URL, array(FILTER_FLAG_PATH_REQUIRED, FILTER_FLAG_HOST_REQUIRED)) || !preg_match('#^https://i\.kinja-img\.com/#', $imgSrc)) die("Invalid image URL ($imgSrc) !"); // Keeping the same filename as on te Kinja server $imgFilename = basename($imgSrc); // Downloading the image if( !file_exists($archivePath . $imgFilename) && !file_put_contents($archivePath . 
$imgFilename, file_get_contents($imgSrc)) ) die("Could not retrieve $imgSrc !"); // TODO: Find a solution to display the galleries on the pages // Replacing the image srcset with the one on the archive on both <source> and <img> ; using src on a <source> is not supported anymore foreach($picture->childNodes as $child) { // $child->setAttribute("src", "./" . $imgFilename); // $child->removeAttribute("srcset"); $child->setAttribute("srcset", "./" . $imgFilename); $child->removeAttribute("src"); } } else die("Could not determine the URL of a <picture> !"); } // Extracting the HTML code of the header containing the poster name, the date of the post and views and comments count which is on the first <div> inside of js_starterpost $postHeader = $starterPost->childNodes->item(0)->C14N(); // The article itself is the second one (<div class="js_post-content">) $postContent = $starterPost->childNodes->item(1)->C14N(); // HTML code of the new "article.html" $newPage = '<!DOCTYPE html><html lang="en-us" data-reactroot=""> <head> <meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0,maximum-scale=10.0"> <meta charset="utf-8"> <!-- Copies of the Kinja CSSs --> <!-- <link href="/oppo/extractor/style1-unique.css" rel="stylesheet" data-styled="" data-styled-version="5.1.0">--> <link href="./style-inline.css" rel="stylesheet" data-styled="" data-styled-version="5.1.0"> <link href="/oppo/extractor/style2.css" rel="stylesheet"> <link href="/oppo/extractor/style3.css" rel="stylesheet" id="509bc78f-a82c-4eb8-9919-3bd416e0c2ca"> <link rel="shortcut icon" type="image/png" href="' . "/oppo/extractor/favicon/${blog}.png" .'"> <!-- Those CSS/JS are not from Kinja --> <style class="Kinja-Extracted">/*div.js_post-content img, div.image-hydration-wrapper { max-width: 800px; }*/ aside img { width: 160px; height: 90px; } aside a { width: 160px !important; height: 110px !important; }</style> <title>' . $pageTitle . '
' . $postTitle . '
' . $postHeader . "\n" . $postContent . '
'; //