Porting my blog for the second time, walk the old data part 4 |
Porting my blog for the second time, setting up Apache2 |
Porting my blog for the second time, finished loading old blog data
This is post #17 of my series about how I port this blog from Blogengine.NET 2.5 ASPX on a Windows Server 2003 to a Linux Ubuntu server, Apache2, MySQL and PHP. A so called LAMP. The introduction to this project can be found in this blog post /post/Porting-my-blog-for-the-second-time-Project-can-start.
Now when I finished loading the data I would like to celebrate this by posting my entire complete perl program reading the XML and loading it into the database.
#!/usr/bin/perl use strict; use warnings; use XML::LibXML; # apt-get install libxml-libxml-perl use DBI; # apt-get install libdbd-mysql-perl use Switch; #apt-get install libswitch-perl use URI::Encode qw(uri_encode uri_decode); #apt-get install liburi-encode-perl use LWP::Simple; use HTML::Entities # apt-get install libhtml-parser-perl binmode STDIN, ':encoding(UTF-8)'; # Database connection my $dbh = DBI-≻connect('dbi:mysql:database=jensblog;#host=localhost','randomuser','somepassword',{AutoCommit=≻1,RaiseError=≻1,PrintError=≻1}); $dbh-≻do( "set names utf8" ); my $deleteDataBase = 1; my $insertCategories = 1; my $insertPost = 1; my $insertPostCategories = 1; my $loadFeedback = 1; my $loadTags = 1; my $downloadImages = 1; # Delete data from the tables # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-5.aspx # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-tags.aspx if ($deleteDataBase) { $dbh-≻do('delete from PostTag'); $dbh-≻do('delete from PostCategory'); $dbh-≻do('delete from PostURL'); $dbh-≻do('delete from URL'); $dbh-≻do('delete from Tag'); $dbh-≻do('delete from Category'); $dbh-≻do('delete from Feedback'); $dbh-≻do('delete from Post'); $dbh-≻do('ALTER TABLE Post AUTO_INCREMENT = 1'); $dbh-≻do('ALTER TABLE PostTag AUTO_INCREMENT = 1'); $dbh-≻do('ALTER TABLE PostURL AUTO_INCREMENT = 1'); $dbh-≻do('ALTER TABLE PostCategory AUTO_INCREMENT = 1'); $dbh-≻do('ALTER TABLE URL AUTO_INCREMENT = 1'); $dbh-≻do('ALTER TABLE Tag AUTO_INCREMENT = 1'); $dbh-≻do('ALTER TABLE Category AUTO_INCREMENT = 1'); $dbh-≻do('ALTER TABLE Feedback AUTO_INCREMENT = 1'); } # For detection of duplicate slug names. # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-4.aspx my %dictSlugToID = (); # Load categories # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data.aspx my $filename = "/usr/local/bin/jensblog/App_Data/categories.xml"; my $parser = XML::LibXML-≻new("1.0", "UTF-8"); my $xmldoc = $parser-≻parse_file($filename); my %dictGUIDToID = (); my %dictTargetAndTagNameToURLID = (); if ($insertCategories) { for my $category_node ($xmldoc-≻findnodes("//category")) { my $strGuidId = $category_node -≻ findnodes('@id') -≻ to_literal; my $strCategoryName = $category_node-≻textContent; $dbh-≻do('INSERT INTO Category (Name) VALUES (?)', undef, $strCategoryName); my $intId = $dbh-≻{mysql_insertid}; $dictGUIDToID{ $strGuidId } = $intId; print $strGuidId . " " . $strCategoryName. " " . $intId . " "; } } Walk('/usr/local/bin/jensblog/App_Data/posts'); # LoadPost('/usr/local/bin/jensblog/App_Data/posts/afe00c46-b999-4305-8ca5-7d7f5ce9ab23.xml'); # LoadPost('/usr/local/bin/jensblog/App_Data/posts/302eaa17-dca5-427f-af96-c39bdb428c41.xml'); # LoadPost('/usr/local/bin/jensblog/App_Data/posts/8d4c8839-f196-480c-a1d7-7f0fb53b9e2f.xml'); # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-2.aspx # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data.aspx sub Walk { my ($directory) = @_; opendir my $dirHandle, $directory or die "Failed to open $directory: $!"; my @dirItems = readdir $dirHandle; for my $dirItem (@dirItems) { if ($dirItem =~ /^(.|..)$/) { next; } if ($dirItem =~ /_vti_cnf$|themes$/) { next; } # Skip this file. The welcome message of the BlogEngine. # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-4.aspx if ($dirItem =~ /c3b491e5-59ac-4f6a-81e5-27e971b903ed.xml/) { next; } my $path = $directory . "/" . $dirItem; if (-f $path && $path =~ /.xml$/i) { LoadPost($path); } if (-d $path) { Walk($path); } } } # Walk() # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-2.aspx sub LoadPost { my ($filenamePost) = @_; print "==============================="; print "========== LOAD POST =========="; print "==============================="; print "File: " . $filenamePost . ""; # Load a post fields # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-load-a-post.aspx #my $filenamePost = "/usr/local/bin/jensblog/App_Data/posts/6d781b81-27ab-4ecf-a4ce-df2795128f0d.xml"; $xmldoc = $parser-≻parse_file($filenamePost); my $strEncoding = $xmldoc-≻encoding(); print "Encoding: " . $strEncoding . ""; my %dictStringFieldToValue = (); my %dictBoolFieldToValue = (); my $postID = 0; if ($insertPost) { for my $node ($xmldoc-≻findnodes("/post/*")) { if ($node -≻ nodeName() =~ /author|title|description|slug|content|pubDate|lastModified/) { $dictStringFieldToValue{$node -≻ nodeName()} = $node-≻textContent(); } elsif ($node -≻ nodeName() =~ /ispublished|isdeleted|iscommentsenabled/) { my $boolValue = 0; if ($node-≻textContent() eq "True") { $boolValue = 1; } $dictBoolFieldToValue{$node -≻ nodeName()} = $boolValue; } } # Clean up the slug. Resolve duplicate slug names. # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-4.aspx $dictStringFieldToValue{"slug"} =~ s/^-?(.*)-?$/$1/; $dictStringFieldToValue{"title"} =~ s/^s?(.*)s?$/$1/; my $iSlugCounter = 2; while (exists ($dictSlugToID{$dictStringFieldToValue{"slug"}})) { $dictStringFieldToValue{"slug"} = $dictStringFieldToValue{"slug"} . "-" . $iSlugCounter; $iSlugCounter++; } print "Slug: " . $dictStringFieldToValue{"slug"} . ""; $dbh-≻do( 'INSERT INTO Post (Author,Title,Description,Slug,Content,' . 'IsPublished,IsDeleted,IsCommentsEnabled,' . 'PublishedOn,ModifiedOn) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', undef, $dictStringFieldToValue{"author"}, $dictStringFieldToValue{"title"}, $dictStringFieldToValue{"description"}, $dictStringFieldToValue{"slug"}, $dictStringFieldToValue{"content"}, $dictBoolFieldToValue{"ispublished"}, $dictBoolFieldToValue{"isdeleted"}, $dictBoolFieldToValue{"iscommentsenabled"}, $dictStringFieldToValue{"pubDate"}, $dictStringFieldToValue{"lastModified"} ); $postID = $dbh-≻{mysql_insertid}; # Remember this slug. # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-4.aspx $dictSlugToID{$dictStringFieldToValue{"slug"}} = $postID; print "----------------------------- Post ID was: " . $postID . " ----------------------------- "; } # Decode entities # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-4.aspx $dictStringFieldToValue{"content"} = decode_entities($dictStringFieldToValue{"content"}, "200-377"); my $strTagName = ""; my $strWidth = ""; my $strHeight = ""; my $strTarget = ""; my $bIsLocalURL = 0; my $strFileName = ""; my $strStyle = ""; my $bIsFileContent = 0; my $bIsAlreadyLoadedInAnotherPost = 0; my %dictURLToDatabaseID = (); print "Process URLs"; # URL and Image processing # Here we parse the post content. Extract URLs and store these in the URL entity. # Download images to be stored locally. Find and replace all URLs with URL place holders. # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-3.aspx # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-1.aspx my $iUrlCount = 0; my $content = $dictStringFieldToValue{"content"}; my $contentParsed = ""; my $contentResult = ""; while ($content =~ /^(.*?)(≺a.+?≻|≺img.+?/≻)/msi) { print "~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-"; print "~-~-~-~-~-~ PROCESS URL ~-~-~-~-~-~-"; print "~-~-~-~-~-~-~- START -~-~-~-~-~-~-~-"; $iUrlCount++; $content = substr($content, length($1 . $2)); $contentParsed .= $1; $contentResult .= $1; # Wait with saving the tag until after it has been replaced with a place holder. my $tag = $2; print "Tag at offset : " . length($contentParsed =~ s/ / /r) . ""; $contentParsed .= $tag; $bIsAlreadyLoadedInAnotherPost = 0; # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-3.aspx # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-4.aspx $bIsLocalURL = $tag =~ /(malmgren.nl|blogspot.com|googleusercontent.com|/image)/i; $bIsFileContent = $tag =~ /.(pdf|jpg|gif|png)/i; if ($tag =~ /^≺([a-z]+)/i) { $strTagName = lc($1); } if ($tag =~ /widths*=s*"(.+?)"/i) { $strWidth = $1; } else { $strWidth = ""; } if ($tag =~ /heights*=s*"(.+?)"/i) { $strHeight = $1; } else { $strWidth = ""; } if ($tag =~ /styles*=s*"(.*?)"/i) { $strStyle = $1; } else { $strStyle = ""; } # Replace an URL with an URL place holder. Store the URL in dictURLToDatabaseID so that # later on it is possible to search for URLs a second time. # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-3.aspx # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-3.aspx my $iDatabaseIDofThisURLentry = -1; if ($tag =~ /(.*?)(https?://|/image)(.+?)["']/i) { my $strParsedURL = $2.$3; $strTarget = uri_decode($strParsedURL); $strTarget =~ s//([iI])mage/http://www.jens.malmgren.nl/$1mage/; if (exists $dictTargetAndTagNameToURLID{$strTarget . "|" . $strTagName}) { print "Already present: " . $strTarget . "|" . $strTagName. ""; # URL record already available for this URL $iDatabaseIDofThisURLentry = $dictTargetAndTagNameToURLID{$strTarget . "|" . $strTagName}; $bIsAlreadyLoadedInAnotherPost = 1; } else { # Insert the URL record. Get the ID of it for replacing URLs with place holders such as /media/PICT3080_s1600.JPG # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-5.aspx $dbh-≻do( 'INSERT INTO URL (TagName, Width, Height, IsLocalURL, Style, IsFileContent ) ' . 'VALUES (?, ?, ?, ?, ?, ? )' , undef, $strTagName,$strWidth,$strHeight,$bIsLocalURL,$strStyle,$bIsFileContent); $dictTargetAndTagNameToURLID{$strTarget . "|" . $strTagName} = $iDatabaseIDofThisURLentry = $dbh-≻{mysql_insertid}; } $dictURLToDatabaseID{$strTarget} = $iDatabaseIDofThisURLentry; $tag =~ s/^(.*?)($strParsedURL)(.*?)$/$1{URL:$iDatabaseIDofThisURLentry}$3/; } # Change the image filename to indicate the resolution of the image. # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-4.aspx $strFileName = $strTarget; if ($strFileName =~ s/(.*?/)(s[0-9]+)(/)([a-z0-9]+)(.[a-z]{3,4})$/$4_$2$5/i) { print "Transformed filename: " . $strFileName . ""; } else { $strFileName = $strTarget; $strFileName =~ s/(.*?/)([^/]+$)/$2/; } # Is the URL pointing to a file to be downloaded? # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-6.aspx # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-4.aspx my $strFullPathAndImageFileName = "/usr/local/bin/images/" . $strFileName; if ($bIsLocalURL && $bIsFileContent && $downloadImages && !$bIsAlreadyLoadedInAnotherPost && !-f $strFullPathAndImageFileName) { print "Get target " . $strTarget . " save to " . $strFileName . ""; my $data = LWP::Simple::get($strTarget); open (FH, "≻$strFullPathAndImageFileName"); binmode (FH); print FH $data; close (FH); } $contentResult .= $tag; # Tag is processed. Here we add it. # Update the FileName and Target of the current URL, # Connect the Post with this URL by creating the PostURL record. # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-5.aspx if (!$bIsAlreadyLoadedInAnotherPost) { $dbh-≻do('UPDATE URL SET Target = ?, Filename = ? WHERE ID = ?' , undef, $strTarget, $strFileName , $iDatabaseIDofThisURLentry); } # Ignore duplicates of links between a Post and URL because that indicated that the same URL is used twize in a post. # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-walk-the-old-data-part-3.aspx eval { $dbh-≻do( 'INSERT INTO PostURL (URLID, PostID )' . 'VALUES (?, ? )' , undef, $iDatabaseIDofThisURLentry,$postID); }; print "PostURL already available. PostID: " . $postID . ", URLID: " . $iDatabaseIDofThisURLentry . ". Message: $@ " if $@; print "iDatabaseIDofThisURLentry " . $iDatabaseIDofThisURLentry. " "; print "postID " . $postID . " "; print "iUrlCount " . $iUrlCount . " "; print "strTagName " . $strTagName . " "; print "strWidth " . $strWidth . " "; print "strHeight " . $strHeight . " "; print "strTarget " . $strTarget . " "; print "bIsLocalURL " . $bIsLocalURL . " "; print "strStyle " . $strStyle . " "; print "strFileName " . $strFileName . " "; print "bIsFileContent " . $bIsFileContent . " "; } print "~-~-~-~-~-~-~-~ END ~-~-~-~-~-~-~-~- "; print "~-~-~-~-~-~ PROCESS URL ~-~-~-~-~-~- "; print "~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~- "; $contentResult .= $content; # Add any text after the last tag. # Post processing the content of the post to find any remaining URLs in the text. # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-3.aspx foreach my $strParsedURL ( keys %dictURLToDatabaseID ) { print "URL: $strParsedURL, DatabaseID: $dictURLToDatabaseID{$strParsedURL} "; $contentResult =~ s/$strParsedURL/{URL:$dictURLToDatabaseID{$strParsedURL}}/g; } # Here all URLs of the content replaced with place holders. Now it can be updated to the database. # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-images-part-5.aspx $dbh-≻do('UPDATE Post SET Content = ? WHERE ID = ?' , undef, $contentResult , $postID); # Load categories of a post. Connect the Category entity to this Post through the PostCategory entity. # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-restart.aspx if ($insertPostCategories) { for my $node ($xmldoc-≻findnodes("/post/categories/category")) { print "Category: " . $node-≻textContent() . " " . $dictGUIDToID{$node-≻textContent()} . " "; $dbh-≻do('INSERT INTO PostCategory (PostID,CategoryID) VALUES (?, ?)' , undef, $postID, $dictGUIDToID{$node-≻textContent()} ); } } # Load tags of a post. Connect the Tags entity to this Post through the PostTag entity. # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-tags.aspx if ($loadTags) { for my $node ($xmldoc-≻findnodes("/post/tags/tag")) { print "Loaded tag: " . $node-≻textContent() . " "; my $sth = $dbh -≻ prepare('SELECT ID FROM Tag WHERE Name = ?'); $sth-≻execute( $node-≻textContent() ); my $tagID = "-"; if(my $row = $sth-≻fetchrow_hashref()) { $tagID = $$row{"ID"}; print "Tag found in database: " . $node-≻textContent() . " with ID: " . $tagID ." "; } if ($tagID eq "-") { print "Tag not found in database: " . $node-≻textContent() . " "; $dbh-≻do('INSERT INTO Tag (Name) VALUES (?)' , undef, $node-≻textContent() ); $tagID = $dbh-≻{mysql_insertid}; } $dbh-≻do('INSERT INTO PostTag (PostID,TagID) VALUES (?, ?)' , undef, $postID, $tagID ); } } # Load feedback. # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-comments.aspx if ($loadFeedback) { for my $node ($xmldoc-≻findnodes("/post/comments/comment")) { print "Feedback on: " . getField("date", $node) . " "; $dbh-≻do( 'INSERT INTO Feedback (PostID, Date, Author, Email, IP, Content, IsApproved, IsDeleted, IsSpam)' . 'VALUES (?, ?, ?, ?, ?, ?, ?, ?, ? )' , undef, $postID, getField("date", $node), getField("author", $node), getField("email", $node), getField("ip", $node), getField("content", $node), getBooleanAttribute("approved", $node), getBooleanAttribute("deleted", $node), getBooleanAttribute("spam", $node) ); } } } # LoadPost() # Used to load True/False attributes in feedback # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-comments.aspx sub getBooleanAttribute { my ($ip_strName, $node) = @_; my $result = 0; if ($node-≻getAttribute($ip_strName) eq "True") { $result = 1; } return $result; } # getBooleanAttribute() # Used to load feedback data # http://www.jens.malmgren.nl/post/Porting-my-blog-for-the-second-time-comments.aspx sub getField { my ($ip_strName, $node) = @_; my @fieldNodeList = $node -≻ findnodes($ip_strName); if (scalar @fieldNodeList ≻ 0) { return $fieldNodeList[0] -≻ textContent(); } else { return ""; } } # getField()
My next project will be to manipulate the webserver.
I was born 1967 in Stockholm, Sweden. I grew up in the small village Vågdalen in north Sweden. 1989 I moved to Umeå to study Computer Science at University of Umeå. 1995 I moved to the Netherlands where I live in Almere not far from Amsterdam.
Here on this site I let you see my creations.
I create, that is my hobby.