Exploring the Jordanian Twittersphere¶

Data Collection¶

  • We used the X (formerly Twitter) academic research API to pull data from three geolocations in Jordan: Amman, Zarqa, and Irbid
In [ ]:
import tweepy
import pandas as pd

BEARER_TOKEN = "your_bearer_token_here"

client = tweepy.Client(bearer_token=BEARER_TOKEN)

amman_geo = "35.9239,31.9522,30km"
irbid_geo = "37.2166,37.1666,30km"
zarqa_geo = "36.0833,32.7167,30km"

locations = {
    'amman': amman_geo,
    'irbid': irbid_geo, 
    'zarqa': zarqa_geo
}

for city, geo in locations.items():
    query = f"point_radius:[{geo}]"
    
    tweets = client.search_all_tweets(
        query=query,
        start_time="2023-05-01T00:00:00Z",
        max_results=500,  # full-archive search returns at most 500 tweets per request
        tweet_fields=['created_at', 'lang', 'author_id', 'geo', 'entities', 
                     'public_metrics', 'referenced_tweets', 'in_reply_to_user_id'],
        expansions=['author_id', 'geo.place_id', 'in_reply_to_user_id',
                    'referenced_tweets.id', 'referenced_tweets.id.author_id'],
        place_fields=['country', 'country_code', 'full_name', 'geo', 'name'],
        user_fields=['location', 'username']
    )

    tweets_data = []
    for tweet in tweets.data or []:
        users = tweets.includes.get('users', [])
        user = next((u for u in users if u.id == tweet.author_id), None)
        reply_user = next((u for u in users if u.id == tweet.in_reply_to_user_id), None)

        # The v2 API exposes quotes/retweets/replies via referenced_tweets
        quoted_id, retweeted_id, replied_to_id = None, None, None
        for ref in tweet.referenced_tweets or []:
            if ref.type == 'quoted':
                quoted_id = str(ref.id)
            elif ref.type == 'retweeted':
                retweeted_id = str(ref.id)
            elif ref.type == 'replied_to':
                replied_to_id = str(ref.id)

        tweet_dict = {
            'screen_name': user.username if user else None,
            'user_location': user.location if user else None,
            'id_str': str(tweet.id),
            'created_at': tweet.created_at,
            'favorite_count': tweet.public_metrics['like_count'],
            'retweet_count': tweet.public_metrics['retweet_count'],
            'text': tweet.text,
            'geo': tweet.geo.get('coordinates') if tweet.geo else None,
            'coordinates': tweet.geo.get('coordinates') if tweet.geo else None,
            'entities': tweet.entities,
            'place': tweet.geo.get('place_id') if tweet.geo else None,
            'in_reply_to_screen_name': reply_user.username if reply_user else None,
            'in_reply_to_user_id': tweet.in_reply_to_user_id,
            'in_reply_to_status_id_str': replied_to_id,
            'is_quote_status': quoted_id is not None,
            'lang': tweet.lang,
            'quoted_status_id_str': quoted_id,
            'retweeted_status_id_str': retweeted_id,
        }
        tweets_data.append(tweet_dict)
    
    df = pd.DataFrame(tweets_data)
    df.to_csv(f'tweets_{city}.csv', index=False)
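
A single search_all_tweets call returns at most one page of results (up to 500 tweets with max_results), so a fuller pull would paginate. Below is a minimal sketch using tweepy.Paginator; the per-city limit and the _paginated filenames are placeholders rather than the settings used for the actual collection, and the full-archive endpoint is rate-limited, so a long pull needs throttling.

In [ ]:
# Sketch: paginate the full-archive search instead of taking a single page.
for city, geo in locations.items():
    paginated = tweepy.Paginator(
        client.search_all_tweets,
        query=f"point_radius:[{geo}]",
        start_time="2023-05-01T00:00:00Z",
        tweet_fields=['created_at', 'lang', 'author_id', 'geo', 'public_metrics'],
        max_results=500,
    ).flatten(limit=5000)  # placeholder cap per city

    rows = [{'id_str': str(t.id), 'created_at': t.created_at,
             'lang': t.lang, 'text': t.text} for t in paginated]
    pd.DataFrame(rows).to_csv(f'tweets_{city}_paginated.csv', index=False)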
In [ ]:
amman_df = pd.read_csv('tweets_amman.csv', dtype=str)
irbid_df = pd.read_csv('tweets_irbid.csv', dtype=str)
zarqa_df = pd.read_csv('tweets_zarqa.csv', dtype=str)
In [ ]:
# Add retweet columns: copy values only for rows that are retweets
for df in [amman_df, irbid_df, zarqa_df]:
    df['is_a_retweet'] = df['retweeted_status_id_str'].notna()
    is_rt = df['is_a_retweet']
    df['retweet_created_at'] = df['created_at'].where(is_rt)
    df['retweet_lang'] = df['lang'].where(is_rt)
    df['retweet_place'] = df['place'].where(is_rt)
    df['retweet_geo'] = df['geo'].where(is_rt)
    df['retweet_coordinates'] = df['coordinates'].where(is_rt)
    df['retweet_text'] = df['text'].where(is_rt)
    df['retweet_count.1'] = df['retweet_count'].where(is_rt)
    df['in_reply_to_screen_name.1'] = df['in_reply_to_screen_name'].where(is_rt)

# Save updated dataframes
amman_df.to_csv('tweets_amman.csv', index=False)
irbid_df.to_csv('tweets_irbid.csv', index=False) 
zarqa_df.to_csv('tweets_zarqa.csv', index=False)

Data Processing¶

  • tweet cleaning
In [ ]:
import re

class Cleaner:
    # Collapse characters repeated more than `limit` times (e.g. "coooool" -> "cool")
    repeated_characters_pattern = re.compile(r'(.)\1{1,}')
    # Arabic diacritics (tashkeel) and Quranic annotation marks
    arabic_diacritics_pattern = re.compile(r"[\u064B-\u0652\u0610-\u061A]")
    # Quotation marks, tatweel, non-breaking spaces, and directional control characters
    unwanted_chars_pattern = re.compile(r"«|»|\"|\'|\xa0|'|”|“|ـ|\u202c|\u202e")
    # Leading "RT" marker on retweets
    rt_pattern = re.compile(r"^RT")
    # @mentions (optionally followed by a colon)
    mention_pattern = re.compile(r"@[a-zA-Z_0-9]+:?")
    # URLs
    link_pattern = re.compile(r"https?://\S+")
    # Immediately repeated words of two or more characters
    repeated_words_pattern = re.compile(r'(\w{2,})\1{1,}')

    @staticmethod
    def clean(text, limit=2):
        text = Cleaner.repeated_characters_pattern.sub(lambda match: match.group(1) * limit, text)
        text = Cleaner.unwanted_chars_pattern.sub('', text)
        text = Cleaner.arabic_diacritics_pattern.sub('', text)
        text = Cleaner.repeated_words_pattern.sub(lambda match: match.group(1), text)
        text = Cleaner.rt_pattern.sub('', text)
        text = Cleaner.mention_pattern.sub('', text)
        text = Cleaner.link_pattern.sub('', text)
        return text
        
In [ ]:
amman_df['text_clean'] = amman_df['text'].apply(Cleaner.clean)
irbid_df['text_clean'] = irbid_df['text'].apply(Cleaner.clean)
zarqa_df['text_clean'] = zarqa_df['text'].apply(Cleaner.clean)

amman_df.to_csv('tweets_amman.csv', index=False)
irbid_df.to_csv('tweets_irbid.csv', index=False)
zarqa_df.to_csv('tweets_zarqa.csv', index=False)
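
A quick sanity check of what the cleaner removes; the tweet below is an invented example, not a tweet from the dataset.

In [ ]:
# Invented example: the RT marker, mention, quotes, diacritics, long character
# runs and the link should all be stripped or collapsed.
sample = "RT @someuser: حصريًااااا «عاجل» شاهد الفيديو https://t.co/abc123"
print(Cleaner.clean(sample))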
  • Normalizing date
In [ ]:
from datetime import datetime

def convert_datetime(date):
    # The v2 API returns ISO 8601 timestamps; older exports use the legacy
    # "Sat May 06 12:00:00 +0000 2023" format, so try both.
    for fmt in ('%Y-%m-%d %H:%M:%S%z', '%a %b %d %H:%M:%S %z %Y'):
        try:
            return datetime.strptime(date, fmt).strftime('%Y-%m-%d %H:%M:%S')
        except (ValueError, TypeError):
            continue
    return None

amman_df['created_at_datetime'] = amman_df['created_at'].apply(convert_datetime)
irbid_df['created_at_datetime'] = irbid_df['created_at'].apply(convert_datetime)
zarqa_df['created_at_datetime'] = zarqa_df['created_at'].apply(convert_datetime)
  • Running language detection (on top of Twitter's own language tags)
In [ ]:
from langdetect import detect

def detect_language(text):
    try:
        return detect(text)
    except Exception:
        # langdetect raises on empty or undetectable text
        return None

# Run language detection on cleaned text
amman_df['lang_detected'] = amman_df['text_clean'].apply(detect_language)
irbid_df['lang_detected'] = irbid_df['text_clean'].apply(detect_language) 
zarqa_df['lang_detected'] = zarqa_df['text_clean'].apply(detect_language)
  • taking only retweets
In [ ]:
amman_df_rt_only = amman_df[amman_df['is_a_retweet']]
irbid_df_rt_only = irbid_df[irbid_df['is_a_retweet']]
zarqa_df_rt_only = zarqa_df[zarqa_df['is_a_retweet']]
  • downloading images from tweets
In [ ]:
import requests
import os

def download_image(url, save_path):
    try:
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            with open(save_path, 'wb') as f:
                f.write(response.content)
            return True
    except requests.RequestException:
        return False
    return False


city_dfs = {'amman': amman_df, 'irbid': irbid_df, 'zarqa': zarqa_df}

for city, df in city_dfs.items():
    os.makedirs(f'images/{city}', exist_ok=True)
    for idx, row in df.iterrows():
        # 'media_url' is assumed to have been extracted from the tweet entities
        if pd.notna(row.get('media_url')):
            filename = f"images/{city}/{row['id_str']}.jpg"
            download_image(row['media_url'], filename)
  • OCRing images
In [ ]:
import http.client, urllib.request, urllib.parse, urllib.error, base64
from dotenv import load_dotenv
import os
import json
import pandas as pd

def ocr_azure(subscription_key, image_data, feature_type):
    # Send the raw image bytes; the URL variant of the API only works with
    # publicly reachable URLs, which locally downloaded tweet images are not.
    headers = {
        'Content-Type': 'application/octet-stream',
        'Ocp-Apim-Subscription-Key': subscription_key,
    }

    params = urllib.parse.urlencode({
        # Request parameters
        'features': feature_type,
        'gender-neutral-caption': 'False',
    })

    try:
        # Replace with your Computer Vision resource endpoint
        conn = http.client.HTTPSConnection('your-resource-name.cognitiveservices.azure.com')
        conn.request("POST", "/computervision/imageanalysis:analyze?api-version=2023-02-01-preview&%s" % params, image_data, headers)
        response = conn.getresponse()
        data = response.read()
        conn.close()
        return data
    except Exception as e:
        print(f"OCR request failed: {e}")
        return None


load_dotenv()
subscription_key = os.getenv('AZURE_SUBSCRIPTION_KEY')

# Process images from both directories
def process_directory(directory, feature_type='read'):
    # 'read' is the OCR feature name in the Image Analysis 4.0 API
    results = []
    for filename in os.listdir(directory):
        if filename.endswith('.jpg'):
            image_path = os.path.join(directory, filename)
            tweet_id = filename.replace('.jpg', '')

            with open(image_path, 'rb') as f:
                image_data = f.read()

            ocr_result = ocr_azure(subscription_key, image_data, feature_type)
            
            if ocr_result:
                results.append({
                    'tweet_id': tweet_id,
                    'ocr_text': json.loads(ocr_result.decode('utf-8'))
                })
                
    return pd.DataFrame(results)


zarqa_results = process_directory('images/zarqa')
amman_results = process_directory('images/amman') 
irbid_results = process_directory('images/irbid')


zarqa_results.to_csv('zarqa_ocr_results.csv', index=False)
amman_results.to_csv('amman_ocr_results.csv', index=False)
irbid_results.to_csv('irbid_ocr_results.csv', index=False)
  • adding OCR text to the dataframes
In [ ]:
zarqa_ocr = pd.read_csv('zarqa_ocr_results.csv', dtype=str)
amman_ocr = pd.read_csv('amman_ocr_results.csv', dtype=str)
irbid_ocr = pd.read_csv('irbid_ocr_results.csv', dtype=str)

all_ocr = pd.concat([zarqa_ocr, amman_ocr, irbid_ocr])

ocr_dict = dict(zip(all_ocr['tweet_id'], all_ocr['ocr_text']))

# Map the OCR output back onto each city's dataframe by tweet id
for df in [amman_df, irbid_df, zarqa_df]:
    df['image_content'] = df['id_str'].map(ocr_dict).fillna('')
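
The ocr_text column holds the full Image Analysis response. To get plain text to work with, the recognized text can be pulled out of the readResult block; the sketch below assumes the 2023-02-01-preview response shape (readResult carrying a content string) and a CSV round-trip that stored the dict as its string representation. The image_text column name is introduced here for illustration.

In [ ]:
import ast

def extract_ocr_text(analysis):
    # Assumes the 2023-02-01-preview schema: recognized text under readResult['content']
    if isinstance(analysis, str):
        # The CSV round-trip stored the dict as its Python string representation
        try:
            analysis = ast.literal_eval(analysis)
        except (ValueError, SyntaxError):
            return ''
    if not isinstance(analysis, dict):
        return ''
    return analysis.get('readResult', {}).get('content', '')

for df in [amman_df, irbid_df, zarqa_df]:
    df['image_text'] = df['image_content'].apply(extract_ocr_text)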
  • generate user retweet network
In [ ]:
all_retweets = pd.concat([amman_df_rt_only, irbid_df_rt_only, zarqa_df_rt_only])

network_data = []
for _, tweet in all_retweets.iterrows():
    if pd.notna(tweet['screen_name']) and pd.notna(tweet['in_reply_to_screen_name']):
        network_data.append({
            'source': tweet['screen_name'],
            'target': tweet['in_reply_to_screen_name'],
            'lang': tweet['lang_detected']
        })

network_df = pd.DataFrame(network_data)
# Aggregate duplicate edges into a weight, keeping the detected language of the pair
network_df = network_df.groupby(['source', 'target']).agg(weight=('lang', 'size'), lang=('lang', 'first')).reset_index()

network_df.to_csv('jordan_twittersphere_retweet_network.csv', index=False)

User Network Analysis using Cytoscape¶

In this section, we load the retweet network into Cytoscape for visualization and analysis.

  • we color user nodes by the language they predominantly tweet in (Arabic, English, or other); see the node-attribute sketch after this list

  • we run community detection on the network using the Louvain algorithm
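
For the language-based coloring, Cytoscape needs a per-user node attribute alongside the edge list. Below is a minimal sketch of how such a table could be built from the detected tweet languages; the majority-vote rule and the output filename are assumptions made here, not necessarily how the actual attribute table was prepared.

In [ ]:
# Derive each user's dominant detected language and bucket it into
# Arabic / English / Other for node coloring in Cytoscape.
all_tweets = pd.concat([amman_df, irbid_df, zarqa_df])

dominant_lang = (
    all_tweets.dropna(subset=['screen_name', 'lang_detected'])
    .groupby('screen_name')['lang_detected']
    .agg(lambda s: s.mode().iloc[0])
)

node_attrs = dominant_lang.map(
    lambda l: 'Arabic' if l == 'ar' else ('English' if l == 'en' else 'Other')
).rename('language').reset_index()

node_attrs.to_csv('jordan_twittersphere_node_languages.csv', index=False)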

We then import the detected communities (per user) back into the notebook and map them onto tweets.

In [ ]:
communities_df = pd.read_csv('jordan_twittersphere_communities_louvain.csv')
  • we generate summaries for the top 20 communities using an OpenAI chat model (gpt-4o-mini)
In [ ]:
from openai import OpenAI
import pandas as pd

client = OpenAI()

def get_openai_response(prompt):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

# Get top 20 communities by size
communities = communities_df.value_counts("community")
top_20_communities = communities.sort_values(ascending=False).head(20)

# Create empty list to store summaries
community_summaries = []

# Prepare prompts for the summarization model
for community in top_20_communities.index:
    # Get tweets from this community
    community_tweets = all_retweets[all_retweets["screen_name"].isin(
        communities_df[communities_df["community"] == community]["username"]
    )]
    
    # Format tweets for the prompt (first 50 tweets as a sample)
    tweets_text = community_tweets["text"].tolist()
    tweets_sample = tweets_text[:50]
    tweets_block = "\n".join(tweets_sample)  # joined outside the f-string (backslashes are not allowed in f-string expressions before Python 3.12)

    prompt = f"""Analyze these tweets from a Twitter community in Jordan.
    Provide a brief summary of:
    1. The main topics discussed
    2. The apparent interests/focus of this community
    3. Any notable patterns in communication style

    Tweets sample:
    {tweets_block}
    """

    # Call the OpenAI API and store the response
    try:
        response = get_openai_response(prompt)
        community_summaries.append({
            'community': community,
            'size': communities[community],
            'summary': response
        })
            
    except Exception as e:
        print(f"Error processing community {community}: {str(e)}")

# Create DataFrame from summaries
summaries_df = pd.DataFrame(community_summaries)
summaries_df.to_csv('jordan_twittersphere_community_summaries.csv', index=False)

Cross-referencing communities with content similarity¶

  • we take the content from the top 20 communities
  • we use a multilingual embeddings model to generate an embedding for each tweet
  • we use a similarity metric to cluster the tweets by content
  • we compare the resulting clusters with the communities we got from community detection on the retweet network (a sketch of this comparison appears at the end of the section)
In [ ]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import euclidean
import torch
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
In [ ]:
# Multilingual model to handle English and Arabic tweets
model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name, device=device)
In [ ]:
def get_embedding(text, model):
    with torch.no_grad():
        return model.encode([text])[0]

# Attach each author's community to their tweets, then keep the top 20 communities
df = pd.concat([amman_df, irbid_df, zarqa_df])
df = df.merge(communities_df[['username', 'community']],
              left_on='screen_name', right_on='username', how='inner')

top_20_communities = communities.sort_values(ascending=False).head(20).index
community_tweets = df[df['community'].isin(top_20_communities)]

# Create list to store tweet data with embeddings
tweet_embeddings = []

# Process tweets in batches
for idx, row in community_tweets.iterrows():
    try:
        # Get embedding for cleaned text
        embedding = get_embedding(row['text_clean'], model)
        
        # Store tweet data with embedding
        tweet_embeddings.append({
            'tweet_id': row.name,
            'text': row['text_clean'],
            'community': row['community'],
            'embedding': embedding
        })
        
        # Print progress every 100 tweets
        if len(tweet_embeddings) % 100 == 0:
            print(f"Processed {len(tweet_embeddings)} tweets")
            
    except Exception as e:
        print(f"Error processing tweet {row.name}: {str(e)}")
        continue

# Convert to DataFrame
embeddings_df = pd.DataFrame(tweet_embeddings)


def calculate_avg_embedding(group):
    return np.mean(np.stack(group['embedding'].values), axis=0)

community_avg_embeddings = embeddings_df.groupby('community').apply(calculate_avg_embedding)


def calculate_similarity(row):
    community_embedding = community_avg_embeddings[row['community']]
    similarity = cosine_similarity([row['embedding']], [community_embedding])[0][0]
    return similarity



# Similarity of each tweet to its community's average embedding
embeddings_df['similarity_to_community'] = embeddings_df.apply(calculate_similarity, axis=1)

# 2D projection of the tweet embeddings, colored by retweet-network community
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(np.stack(embeddings_df['embedding'].values))

plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    embeddings_2d[:, 0], embeddings_2d[:, 1],
    c=embeddings_df['community'].astype('category').cat.codes, cmap='viridis'
)
plt.colorbar(scatter)
plt.title('Tweet Embeddings Visualization')
plt.show()
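
The t-SNE view gives a qualitative picture; the comparison described above (content clusters vs. retweet communities) still needs an explicit clustering step and an agreement score. Below is a sketch of one way to do it, using KMeans over the embeddings and the adjusted Rand index; the choice of KMeans and of 20 clusters are assumptions made here, not a method stated in the original analysis.

In [ ]:
# Sketch: cluster tweets by embedding similarity and compare the resulting
# partition with the Louvain communities from the retweet network.
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

X = np.stack(embeddings_df['embedding'].values)

# Assumption: use as many content clusters as there are retweet communities (20)
kmeans = KMeans(n_clusters=20, random_state=42, n_init=10)
embeddings_df['content_cluster'] = kmeans.fit_predict(X)

# Adjusted Rand index: 1.0 = identical partitions, ~0 = no better than chance
ari = adjusted_rand_score(
    embeddings_df['community'].astype('category').cat.codes,
    embeddings_df['content_cluster'],
)
print(f"Agreement between content clusters and retweet communities (ARI): {ari:.3f}")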